In [1]:
# Enable IPython's autoreload extension in mode 2, so edits to imported
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import sys
import os

# Add the parent directory (Auditbot_backend) of the notebook's working
# directory to the system path so project-local packages can be imported.
#
# The path is computed once and appended only when it is not already present,
# so re-running this cell does not keep growing sys.path with duplicates.
backend_root = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
if backend_root not in sys.path:
    sys.path.append(backend_root)

# import custom modules
from utils.preprocessing import *
from utils.json_parser import *
from utils.content_page_parser import *
from utils.retriever import *
from utils.custom_print import *
from utils.prompt_engineering import *
from utils.db_utils import *
from utils.langsmith_trace import *
from utils.initialisations import *

In [3]:
# ============================== RAG INPUTS ===================================
# Question sent through the pipeline (AGO reports). Uncomment one of the
# alternatives below to try a different query.
question = "What are the findings pertaining to grant?"
# question = "extract finding on weakness in access controls from FY2018/19 to FY2020/21 AGO's report."
# question = "extract finding on weakness in access controls from FY2018/19 to FY2020/21 AGO's report. Tabulate the output with row heading as Year of Report and details of findings."
# question = "What are the findings pertaining to MOE?"
# question = "What are the areas for improvement in administration of Civil Service Medical and Dental Benifits?"

# ============================= HYPERPARAMETERS ===============================

# --- Preprocessing ------------------------------------------------------------
# Chunking unit: 's' = sentences, 'p' = paragraphs, 'f' = fixed-size strings.
chunking = 's'
# How many smaller chunks are grouped into one bigger chunk.
grouping = 1
# Minimum chunk size.
min_chunk_size = 100

# --- Vector store -------------------------------------------------------------
# Entries are added to the database in batches of this size.
batch_size = 1000

# --- Ranking ------------------------------------------------------------------
# Top-k matches requested for ranking. Sparse and dense search each find
# top_k matches, so hybrid search returns at least top_k and at most
# 2 * top_k matches.
top_k = 30
# Weight of each retrieval in reciprocal rank fusion.
weights = [0.5, 0.5]
# Reciprocal rank fusion constant.
k = 60

# --- Reranking ----------------------------------------------------------------
# Top-n matches kept by the reranker.
top_n = 20

# Cross-encoder model used for reranking.
# Alternative: claimed to be deprecated because it is bad, but it still seems
# to work fine.
# model_name = "cross-encoder/stsb-roberta-base"
# Best performing on Microsoft tests.
model_name = "cross-encoder/ms-marco-MiniLM-L-12-v2"

# =============================== IMPROVEMENTS ================================
HyDE = False
comments = "This is using HyDE" if HyDE else "None"

# ===================== Pack the parameters for tracing =======================
params = (
    question,
    chunking,
    grouping,
    min_chunk_size,
    batch_size,
    top_k,
    weights,
    k,
    top_n,
    model_name,
    HyDE,
    comments,
)


In [4]:
# RUN ONCE
# Generate every data structure required by the rest of the notebook.

# Step 1: generate the chunks using the preprocessing hyperparameters.
generate_chunks(
    DOCUMENT_DIR,
    chunks_path,
    chunk_pageNum_pairs_path,
    s_p_pairs_path,
    chunking,
    grouping,
    min_chunk_size,
    DOC_IDENTIFIER,
)

# Step 2: generate the inverted tree from the chunk / page-number pairs.
has_content_page = True
generate_inverted_tree(
    chunk_pageNum_pairs_path,
    has_content_page,
    save_inverted_tree_path,
    tree_path,
)

2.5298590660095215 seconds
number of chunks: 9127


In [5]:
# RUN ONCE
# Read back every data structure required for retrieval.

# Inverted tree: its keys are the chunks, its values feed the metadata below.
inverted_tree = json_file_to_dict(save_inverted_tree_path)

# The chunks are the keys of the tree.
chunks = list(inverted_tree.keys())
print("Number of unique chunks:", len(chunks))

# The tree's values are preprocessed into the metadata passed to chromadb.
pre_metadata = list(inverted_tree.values())
metadata = chroma_preprocess_metadata(pre_metadata)

# Sentence/paragraph pairs are loaded only for ungrouped (grouping == 1)
# sentence ('s') or fixed-size ('f') chunks; otherwise they stay empty.
s_p_pairs = {}
if grouping == 1 and chunking in ('s', 'f'):
    print("s_p_pairs will be filled")
    s_p_pairs = json_file_to_dict(s_p_pairs_path)

Number of unique chunks: 8210
s_p_pairs will be filled


In [6]:
# RUN ONCE
# Build the vector database used for dense embedding search.

# Embedding function for a model that chromadb supports.
openai_ef = embedding_functions.OpenAIEmbeddingFunction(
    api_key=OPENAI_API_KEY,
    model_name="text-embedding-3-small",
)

# Persistent database client; resets are allowed via its Settings.
client_dense = chromadb.PersistentClient(
    path="../data/db",
    settings=Settings(allow_reset=True),
)

# chromadb's embedding function needs streaming
collection = chroma_get_or_create_collection(
    client_dense,
    name="audit",
    embedding_function=openai_ef,
    reset=True,
)

# Fill the collection with the chunks and their metadata, then report its size.
chroma_fill_db(collection, chunks, metadata, batch_size)
print("number of embeddings in database:", collection.count())

number of embeddings in database: 8210


In [8]:
# RUN ONCE
# Set up Elasticsearch for sparse embedding search.

# Create the docker container for Elasticsearch from a terminal/shell first.
# The command is kept below as a no-op string for reference only.
'''
docker run -p 127.0.0.1:9200:9200 -d --name elasticsearch --network elastic-net \
  -e ELASTIC_PASSWORD=$ELASTIC_PASSWORD \
  -e "discovery.type=single-node" \
  -e "xpack.security.http.ssl.enabled=false" \
  -e "xpack.license.self_generated.type=trial" \
  docker.elastic.co/elasticsearch/elasticsearch:8.14.3
'''

# Connect to the Elasticsearch cluster with the Python Elasticsearch client.
client_sparce = Elasticsearch(
    LOCAL_HOST_URL,
    basic_auth=HTTP_AUTH
)
# Check that the client can reach the docker container. The credentials are
# already configured on the client through basic_auth, so they are not passed
# again here: per-request transport options such as http_auth are deprecated
# in the 8.x Python client.
print(client_sparce.info())

# Index the chunks using Elasticsearch (stored in the docker container).
index_elastic_db(client_sparce, index_name, HTTP_AUTH, chunks, reset = True)

{'name': '50cd72118574', 'cluster_name': 'docker-cluster', 'cluster_uuid': 'l-puMSQDTvqL7nQDfYreOg', 'version': {'number': '8.14.3', 'build_flavor': 'default', 'build_type': 'docker', 'build_hash': 'd55f984299e0e88dee72ebd8255f7ff130859ad0', 'build_date': '2024-07-07T22:04:49.882652950Z', 'build_snapshot': False, 'lucene_version': '9.10.0', 'minimum_wire_compatibility_version': '7.17.0', 'minimum_index_compatibility_version': '7.0.0'}, 'tagline': 'You Know, for Search'}
reset index: chromadb_documents


In [9]:
# When HyDE is enabled the retrieval query is the gpt4o output for
# hyde(question); otherwise the raw question is used as the query.
query = gpt4o(hyde(question), llm) if HyDE else question

In [10]:
# Re-runnable: execute again whenever the query changes.
# First-stage ranking.

# Dense embedding search against the chromadb collection.
embedding_results = chromadb_embedding_search(collection, query, top_k)

# BM25 search through Elasticsearch.
bm25_results = bm25_elasticsearch(
    client_sparce, index_name, HTTP_AUTH, query, top_k
)

# Combine both result lists with reciprocal rank fusion (RRF).
good_chunks = reciprocal_rank_fusion(bm25_results, embedding_results, weights, k)

print("COMBINED RANKING---------------------------------------------------------\n")
pretty_print_list(good_chunks)

COMBINED RANKING---------------------------------------------------------

idx: 0

Details of the lapses pertaining to the enforcement of SDL collections are in the 
 
following paragraphs

----------------------------------------------------------

idx: 1

Stage 1: Grant Design and Setup
– whether there were processes and controls in place to ensure that 
grant programmes were authorised and administered in accordance 
with the objective(s) of the grant

----------------------------------------------------------

idx: 2

Audit findings are conveyed by AGO to the ministries and statutory boards audited 
by way of “management letters”

----------------------------------------------------------

idx: 3

Stage 1: Grant Design and Setup
– whether processes were in place to ensure that grant programmes 
were authorised and reviewed for relevance
b

----------------------------------------------------------

idx: 4

Pertaining to the lack of checks on declarations by grant recipients, EDB 
e

In [11]:
# Split the dense retrieval results into two parallel tuples.
# NOTE(review): assumes each result is a 2-item (chunk, rank) pair — confirm
# against chromadb_embedding_search.
# zip(*[]) yields nothing, so the two-name unpacking would raise ValueError
# when the search returns no results; fall back to two empty tuples instead.
embedding_pairs = list(embedding_results)
embedding_list, embedding_rank = zip(*embedding_pairs) if embedding_pairs else ((), ())
print("DENSE RETRIEVAL RANKING---------------------------------------------------------\n")
pretty_print_list(embedding_list)

DENSE RETRIEVAL RANKING---------------------------------------------------------

idx: 0

Stage 1: Grant Design and Setup
– whether there were processes and controls in place to ensure that 
grant programmes were authorised and administered in accordance 
with the objective(s) of the grant

----------------------------------------------------------

idx: 1

Stage 1: Grant Design and Setup
– whether processes were in place to ensure that grant programmes 
were authorised and reviewed for relevance
b

----------------------------------------------------------

idx: 2

Stage 2: Grant Evaluation and Approval
– whether there were processes and controls in place to ensure that 
grant applications were properly evaluated and approved

----------------------------------------------------------

idx: 3

Stage 1 – Grant Design and Setup
AGO observed that the grant eligibility criteria and operational requirements for 
the administration of the grant schemes were properly laid down in legislation

In [12]:
# Split the sparse (BM25) retrieval results into two parallel tuples.
# NOTE(review): assumes each result is a 2-item (chunk, rank) pair — confirm
# against bm25_elasticsearch.
# A query whose terms match nothing can yield no hits; zip(*[]) then yields
# nothing and the two-name unpacking would raise ValueError, so fall back to
# two empty tuples instead.
bm25_pairs = list(bm25_results)
bm25_list, bm25_rank = zip(*bm25_pairs) if bm25_pairs else ((), ())
print("SPARSE RETRIEVAL RANKING---------------------------------------------------------\n")
pretty_print_list(bm25_list)

SPARSE RETRIEVAL RANKING---------------------------------------------------------

idx: 0

Details of the lapses pertaining to the enforcement of SDL collections are in the 
 
following paragraphs

----------------------------------------------------------

idx: 1

Audit findings are conveyed by AGO to the ministries and statutory boards audited 
by way of “management letters”

----------------------------------------------------------

idx: 2

Pertaining to the lack of checks on declarations by grant recipients, EDB 
explained that there were specific controls in place to ensure that grant recipients take 
ownership for accurate and credible reporting

----------------------------------------------------------

idx: 3

Audit findings are conveyed to the Government ministries, statutory boards and other 
entities audited by way of “management letters”

----------------------------------------------------------

idx: 4

These are 
typically the more significant findings in terms of mone

In [13]:
# Can be run multiple times when the query changes
# Reranking: pass the fused first-stage chunks (good_chunks) and the query to
# the reranker configured by model_name and top_n; it returns the chunks
# together with their scores.
best_chunks, scores = reranking(model_name, good_chunks, query, top_n)

print("RERANKING-----------------------------------------------------------\n")
# Print each reranked chunk alongside its score.
pretty_print_rank(best_chunks, scores)

RERANKING-----------------------------------------------------------

RANK: 1

Pertaining to the lack of checks on declarations by grant recipients, EDB 
explained that there were specific controls in place to ensure that grant recipients take 
ownership for accurate and credible reporting

SCORE: 3.238358

----------------------------------------------------------

RANK: 2

The audit examined whether there was a proper framework for grant 
management and whether due process was followed for the above stages

SCORE: -0.9363227

----------------------------------------------------------

RANK: 3

The audit examined whether there was a proper framework for grant 
management and whether due process was followed for the above stages by the two 
agencies

SCORE: -1.0825868

----------------------------------------------------------

RANK: 4

Stage 4: Grant Monitoring and Review
–	
Whether there were processes and controls in place to ensure 
that grants were managed in accordance with relev

In [14]:
# Re-runnable: execute again whenever the query changes.
# Build the prompt from the question, the inverted tree, the reranked chunks,
# the chunking mode and the sentence/paragraph pairs, then show it.
prompt = generate_prompt(question, inverted_tree, best_chunks, chunking, s_p_pairs)

print(prompt)

Role:
You are a specialist who uses the context provided to answer the query.

Instruction:
Your response should cite sources' year and page number.
If possible, make ministries or government agencies the headings.
If you are unable to provide an answer, state "Unable to find, submit prompt again."

Background:
The context is taken from audit reports from the Auditor-General's Office (AGO) of Singapore. 
AGO is an independent organ of state and the national auditor. They play an important role in enhancing public accountability in the management and use of public funds and resources through their audits.

They audit
    government ministries and departments
    organs of state
    statutory boards
    government funds
    other public authorities and bodies administering public funds (upon their request for audit), e.g. government-owned companies.

They report their audit observations to the President, Parliament and the public through the Annual Report of the Auditor-General managemen

In [15]:
# Comparison prompt, built without the inverted tree.
# Re-runnable: execute again whenever the query changes.
bad_prompt = generate_bad_prompt(question, best_chunks, chunking, s_p_pairs)

print(bad_prompt)

Pertaining to the lack of checks on declarations by grant recipients, EDB 
explained that there were specific controls in place to ensure that grant recipients take 
ownership for accurate and credible reporting.  These included sample checks with 
onsite visits by its Internal Audit, and conduct of site visits by its Cluster Groups for 
those incentive schemes involving support for equipment or materials.  AGO noted 
that the site visits by EDB’s Cluster Groups would apply to five of the nine schemes 
audited by AGO

The audit examined whether there was a proper framework for grant 
management and whether due process was followed for the above stages.  The audit 
did not seek to certify whether the grant recipients had, in all material aspects, used or 
managed the grants in accordance with the grant terms and conditions.  For grants 
which were managed by WSG and ESG jointly with their programme partners (PPs) 
such as Trade Associations and Chambers (TACs), the audit focus was on WS

In [16]:
# response = rag_pipeline(params, prompt, llm)
# Q: I get the tokenizers fork/deadlock warning below when this call is placed in another file. Is this OK?
# A: Yes, it is OK. It does not affect the LLM outputs, so just disable tokenizers parallelism since it is not needed.
'''
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
To disable this warning, you can either:
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
'''



In [17]:
# Disable huggingface/tokenizers parallelism through the environment variable
# named in the fork warning, so the warning is not emitted during the run.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Run the RAG pipeline on the structured prompt; params carries the
# hyperparameters packed for tracing.
response = rag_pipeline(params, prompt, llm)

### Economic Development Board (EDB)
**Year: 2016_17**
- **Context**: Lapses in Administration of Grants
- **Findings**: EDB had specific controls to ensure accurate and credible reporting by grant recipients, including sample checks with onsite visits by its Internal Audit and site visits by its Cluster Groups. However, these site visits would only apply to five of the nine schemes audited by AGO (Page 48).

### Ministry of Trade and Industry, SkillsFuture Singapore Agency (SSG)
**Year: 2021_22**
- **Context**: Lapses in Management of Grants
- **Findings**: AGO found inadequate monitoring and checks by SSG and its outsourced service provider to ensure that grants disbursed to individuals, employers, and training providers were valid, correct, and compliant with the grant terms and conditions (Page 38).

### Ministry of Sustainability and the Environment, National Environment Agency (NEA)
**Year: 2021_22**
- **Context**: Possible Irregularities in Quotations Submitted for Grant Applications
- **Findings**: The grant scheme aims to raise operational efficiency and productivity of the environmental services industry through technology adoption. Applicants are required to identify the equipment or digital solution they intend to procure and submit multiple quotations to demonstrate cost reasonableness (Page 46).

### Ministry of Transport, Civil Aviation Authority of Singapore (CAAS)
**Year: 2022_23**
- **Context**: Lapses in Management of Grants
- **Findings**: For a certain grant scheme, some eligibility criteria were either inaccurately stated or not included in the grant agreements with two companies (Page 42).

### Thematic Audits
**Year: 2017_18**
- **Context**: Thematic Audit on Management of R&D Grants
- **Findings**: Generally, there were established processes for grant application, evaluation, and award in the two public sector entities that underwent the audit. However, there is a need to strengthen controls in areas such as monitoring and review of progress/final reports and audit reports, and the recovery of unutilised funds (Page 4).

**Year: 2018_19**
- **Context**: Thematic Audit on Social Grant Programmes managed by MOH and MSF
- **Findings**: AGO noted gaps in management of these programmes. Specifically, $1.59 billion was disbursed over a two-year period. AGO test-checked grants covering $488.52 million and found gaps in five stages of grant management, from design and setup to cessation of grants (Page 7).

**Year: 2019_20**
- **Context**: Thematic Audit Summary
- **Findings**:
  - **Stage 1: Grant Design and Setup**: Checks to ensure that grant programmes were authorised and administered according to objectives (Page 53).
  - **Stage 2: Grant Evaluation and Approval**: Processes to ensure proper evaluation and approval of grant applications and that agreements with grant recipients were properly entered into (Page 53).
  - **Stage 4: Grant Monitoring and Review**: Ensuring grants were managed according to terms and conditions and that deliverables were achieved (Page 54).
  - **Inadequate assessment**: Noted instances where proposed costs were inadequately assessed, and eligibility of grant applicants not verified. Instances where grant requirements might have been circumvented were also observed (Page 57).

**Year: 2022_23**
- **Context**: COVID-19 Related Grants
- **Findings**:
  - **Stage 2: Grant Evaluation and Approval**: Processes for proper evaluation and stipulation of terms and conditions for compliance were assessed (Page 48).
  - **Stage 4: Grant Monitoring and Review**: Processes to ensure grants were managed in accordance with relevant terms and conditions and that deliverables were achieved (Page 49).

### General Findings
- **Year: 2014_15**
  - **Context**: General Observations on Grants
  - **Findings**: Numerous instances of laxity in the administration of grants were uncovered, pointing to a need for public sector entities to ensure correct amounts are disbursed and grant conditions are adhered to (Page 3).

- **Year: 2018_19**
  - **Context**: Thematic Audit, Summary
  - **Findings**: Ensuring grant programmes were authorised and reviewed for relevance (Page 54).

Overall, the findings indicate a range of issues from inadequate monitoring and evaluation to incorrect or insufficient documentation and enforcement of grant terms and conditions. Addressing these gaps can enhance accountability and efficiency in managing public funds.

In [18]:
# Set again so this cell can also be run on its own: disables
# huggingface/tokenizers parallelism, as named in the fork warning.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Run the same pipeline on the comparison prompt (bad_prompt) with the same
# params, so the two responses can be compared.
bad_response = rag_pipeline(params, bad_prompt, llm)

The findings pertaining to grant management from the audit conducted by the Auditor-General's Office (AGO) cover several key areas and stages:

### General Observations:
1. **Laxity in Administration**:
   - Instances indicating laxity in administration were uncovered.
   - Common weaknesses included failure to ensure the correct amount of grant disbursement and conditions adherence.

2. **Accountability Importance**:
   - Emphasis on the importance of controls and proper mechanisms to ensure grants are used for intended purposes.

### Specific Findings:
#### Stage 1: Grant Design and Setup
1. **Grant Eligibility Criteria and Operational Requirements**:
   - These were found to be properly laid down in legislation or implementation documents.
   - Proper contracts and agreements were entered into with external parties for key processes.
   - Approval for funding was obtained from the Ministry of Finance (MOF).

2. **Inaccurate or Unstated Eligibility Criteria**:
   - For one grant scheme, AGO noted eligibility criteria either inaccurately stated or not stated in agreements with two companies.

#### Stage 2: Grant Evaluation and Approval
1. **Processes and Controls**:
   - Processes to invite, receive, evaluate, and approve proposals were generally in place.
   - Nevertheless, controls need strengthening, particularly in:
     - Evaluation of grant cases.
     - Stipulating and ensuring compliance with proper terms and conditions.
     - Entering proper agreements with grant recipients.

2. **R&D Grants**:
   - Established processes for application, evaluation, and award.
   - Need for improved controls in monitoring progress/final reports and recovery of unutilized funds.

#### Stage 3: Disbursement of Grants
1. **Accuracy and Timeliness**:
   - Processes should ensure grants are disbursed accurately and timely.

2. **Inadequate Monitoring and Checks**:
   - Found in SkillsFuture Singapore (SSG) and its outsourced service provider regarding compliance with grant terms.

3. **Assessment and Verification**:
   - Inadequate assessment of costs to be supported and verification of applicants' eligibility.
   - Noted that some companies or individuals might have circumvented grant requirements and controls.

#### Stage 4: Grant Monitoring and Review
1. **Processes and Controls**:
   - Processes to ensure management in accordance with relevant terms and deliverables achievement were evaluated.

#### Gaps Identified:
1. **Social Grant Programmes**:
   - Thematic audit on social grants by the Ministry of Health (MOH) and Ministry of Social and Family Development (MSF). 
   - $1.59 billion disbursed, covering 1,058 Programme-Voluntary Welfare Organisations (VWOs) with $488.52 million test-checked.
   - Five stages of grant management audited with emphasis on monitoring and review.

2. **Public Sector Lapses**:
   - In procurement, contract management, and financial administration.
   - Need for improvements noted in grant disbursement and management oversight.

These findings highlight the need for improvements in the administration, monitoring, and overall management of grant programmes to ensure proper accountability and effective utilization of funds.