In [1]:
%load_ext autoreload
%autoreload 2

In [14]:
import sys
import os

# Add the parent directory (Auditbot_backend) to the system path
sys.path.append(
    os.path.abspath(
        os.path.join(
            os.path.dirname(f"{os.getcwd()}/RAG_db_NDR.ipynb"),
            '..'
        )
    )
)

# import custom modules
from utils.preprocessing import *
from utils.json_parser import *
from utils.content_page_parser import *
from utils.retriever import *
from utils.custom_print import *
from utils.prompt_engineering import *
from utils.db_utils import *
from utils.langsmith_trace import *
from utils.initialisations import *

In [3]:
# RAG input parameters =======================================================

# National Day Rally
question = "How are the elderly benifitting ?"

# HYPERPARAMETERS ============================================================

# preprocessing --------------------------------------------------------------

# Chunk into sentences ('s') or paragraphs ('p') or fixed-size strings ('f')
chunking='s' 

# Group smaller chunks into a bigger chunk
grouping=1

# control minimum chunk size
min_chunk_size=100

# vector store ---------------------------------------------------------------

# add to data base in batches
batch_size = 1000

# Ranking --------------------------------------------------------------------

# top k matches for ranking. 
# Both sparse and dense search find top_k matches so hybrid search will return 
# at least top_k matches and most 2 * top_k matches
top_k = 30

# weights for each retrieval for reciprocal rank fusion
weights = [0.5, 0.5]

# reciprocal ranking fusion constant
k = 60

# Reranking ------------------------------------------------------------------

# top n matches for reranking
top_n = 20

# Cross encoder model

# claimed to be deprecated because it is bad but seems to still work fine
# model_name = "cross-encoder/stsb-roberta-base"

# best performing on Microsoft tests
model_name = "cross-encoder/ms-marco-MiniLM-L-12-v2"


# IMPROVEMENTS ===============================================================
HyDE = False
if HyDE:
    comments = "This is using HyDE"
else:
    comments = "None"

# pack parameters for tracing ================================================
params = (question, chunking, grouping, min_chunk_size, batch_size, top_k, 
          weights, k, top_n, model_name, HyDE, comments)


In [4]:
# RUN ONCE
# generate all required data structures

# generate chunks
generate_chunks(DOCUMENT_DIR_NDR,
                chunks_path,
                chunk_pageNum_pairs_path,
                s_p_pairs_path, 
                chunking, 
                grouping, 
                min_chunk_size,
                DOC_IDENTIFIER_NDR)


# generate inverted tree
has_content_page = False
generate_inverted_tree(chunk_pageNum_pairs_path, 
                       has_content_page, 
                       save_inverted_tree_path,
                       tree_path)

0.15262699127197266 seconds
number of chunks: 177


In [5]:
# RUN ONCE
# retrieve all required data structures

# load tree
inverted_tree = json_file_to_dict(save_inverted_tree_path)

# load chunks from tree's keys
chunks = list(inverted_tree.keys())
print("Number of unique chunks:", len(chunks))

# prepare metadata for chromadb
pre_metadata = list(inverted_tree.values())
metadata = chroma_preprocess_metadata(pre_metadata)

# load sentence paragraph pairs
if (chunking == 's' or chunking == 'f') and grouping == 1:
    print("s_p_pairs will be filled")
    s_p_pairs = json_file_to_dict(s_p_pairs_path)
else:
    s_p_pairs = {}

Number of unique chunks: 177
s_p_pairs will be filled


In [6]:
# RUN ONCE
# set up vector database for dense embedding search

# chromadb supported model
openai_ef = embedding_functions.OpenAIEmbeddingFunction(
                api_key=OPENAI_API_KEY,
                model_name="text-embedding-3-small"
            )

# create db
client_dense = chromadb.PersistentClient(path="../data/db", 
                                   settings = Settings(allow_reset=True))

# chromadb's embedding function needs streaming
collection = chroma_get_or_create_collection(client_dense, 
                                             name = "rally", 
                                             embedding_function = openai_ef, 
                                             reset = True)

# fill db
chroma_fill_db(collection, chunks, metadata, batch_size)
print("number of embeddings in database:",collection.count())

number of embeddings in database: 177


In [17]:
# RUN ONCE
# set up elasticseach for sparse embedding search

# create docker container for elasticsearch on terminal/shell
'''
docker run -p 127.0.0.1:9200:9200 -d --name elasticsearch --network elastic-net \
  -e ELASTIC_PASSWORD=$ELASTIC_PASSWORD \
  -e "discovery.type=single-node" \
  -e "xpack.security.http.ssl.enabled=false" \
  -e "xpack.license.self_generated.type=trial" \
  docker.elastic.co/elasticsearch/elasticsearch:8.14.3
'''

# connect to the Elasticsearch cluster from python elasticsearch client
client_sparce = Elasticsearch(
    LOCAL_HOST_URL,
    basic_auth=HTTP_AUTH
)
# checks if client is connected to docker container
print(client_sparce.info(http_auth=HTTP_AUTH))

# index chunks using elasticsearch (saved in docker)
index_elastic_db(client_sparce, index_name, HTTP_AUTH, chunks, reset = True)

{'name': '50cd72118574', 'cluster_name': 'docker-cluster', 'cluster_uuid': 'l-puMSQDTvqL7nQDfYreOg', 'version': {'number': '8.14.3', 'build_flavor': 'default', 'build_type': 'docker', 'build_hash': 'd55f984299e0e88dee72ebd8255f7ff130859ad0', 'build_date': '2024-07-07T22:04:49.882652950Z', 'build_snapshot': False, 'lucene_version': '9.10.0', 'minimum_wire_compatibility_version': '7.17.0', 'minimum_index_compatibility_version': '7.0.0'}, 'tagline': 'You Know, for Search'}
reset index: chromadb_documents


In [18]:
if HyDE:
    query = gpt4o(hyde(question), llm)

else:
    query = question

In [19]:
# Can be run multiple times when change in query
# 1st ranking

# dense embeding search 
embedding_results = chromadb_embedding_search(collection, query, top_k)

# perform bm_25 using elasticsearch
bm25_results = bm25_elasticsearch(client_sparce, index_name, HTTP_AUTH, query, top_k)


# RRF
good_chunks  = reciprocal_rank_fusion(bm25_results, 
                                 embedding_results, 
                                 weights, 
                                 k)

print("COMBINED RANKING---------------------------------------------------------\n")
pretty_print_list(good_chunks)

COMBINED RANKING---------------------------------------------------------

idx: 0

This is a new housing
concept that integrates elderly-friendly housing with care and social services

----------------------------------------------------------

idx: 1

Just as we created Workfare instead
of Welfare, what would be better than unemployment insurance? How can we
help in a way that minimises the negative outcomes we have seen elsewhere? 
We discussed this with our brothers and sisters from NTUC and the Labour
Movement who have been championing this initiative for some time

----------------------------------------------------------

idx: 2

Anyone who wants to upgrade their skills – at any point in their lives, no matter
how old – will be given the opportunity to do so in Singapore

----------------------------------------------------------

idx: 3

We are implementing the new nationwide Age Well SG – to enable our seniors
to stay active and age well within their communities

-------------

In [20]:
embedding_list, embedding_rank = zip(*embedding_results)
print("DENSE RETRIEVAL RANKING---------------------------------------------------------\n")
pretty_print_list(embedding_list)

DENSE RETRIEVAL RANKING---------------------------------------------------------

idx: 0

We are implementing the new nationwide Age Well SG – to enable our seniors
to stay active and age well within their communities

----------------------------------------------------------

idx: 1

This is a new housing
concept that integrates elderly-friendly housing with care and social services

----------------------------------------------------------

idx: 2

But the experience has not always been positive – because after getting a
generous benefit, the person might find it more attractive to stay unemployed
than to go back to work

----------------------------------------------------------

idx: 3

Just as we created Workfare instead
of Welfare, what would be better than unemployment insurance? How can we
help in a way that minimises the negative outcomes we have seen elsewhere? 
We discussed this with our brothers and sisters from NTUC and the Labour
Movement who have been championing this 

In [21]:
bm25_list, bm25_rank = zip(*bm25_results)
print("SPARSE RETRIEVAL RANKING---------------------------------------------------------\n")
pretty_print_list(bm25_list)

SPARSE RETRIEVAL RANKING---------------------------------------------------------

idx: 0

This is a new housing
concept that integrates elderly-friendly housing with care and social services

----------------------------------------------------------

idx: 1

But the parent in fact wanted to know about his child’s overall well-being, and
how his child was doing as a person

----------------------------------------------------------

idx: 2

Now you only need one permit for a show, regardless of how many drones you
use!
And no more stickers!
You also save money in the process

----------------------------------------------------------

idx: 3

Anyone who wants to upgrade their skills – at any point in their lives, no matter
how old – will be given the opportunity to do so in Singapore

----------------------------------------------------------

idx: 4

How do we compete and earn our living in such an
environment? 
We have to work even harder to stay competitive, and push the frontiers 

In [22]:
# Can be run multiple times when change in query
# Reranking
best_chunks, scores = reranking(model_name, good_chunks, query, top_n)

print("RERANKING-----------------------------------------------------------\n")
pretty_print_rank(best_chunks, scores)

RERANKING-----------------------------------------------------------

RANK: 1

This is a new housing
concept that integrates elderly-friendly housing with care and social services

SCORE: -9.43601

----------------------------------------------------------

RANK: 2

We are implementing the new nationwide Age Well SG – to enable our seniors
to stay active and age well within their communities

SCORE: -10.325906

----------------------------------------------------------

RANK: 3

Anyone who wants to upgrade their skills – at any point in their lives, no matter
how old – will be given the opportunity to do so in Singapore

SCORE: -10.97282

----------------------------------------------------------

RANK: 4

More fathers are playing a bigger role in their families – even changing diapers
and doing housework!
I had a chat with a colleague from the Ministry of Finance, Arash

SCORE: -11.105978

----------------------------------------------------------

RANK: 5

But employers and businesse

In [33]:
# Can be run multiple times when change in query
# in utils.preprocessing, change 
prompt = generate_prompt(question, 
                         inverted_tree, 
                         best_chunks, 
                         chunking, 
                         s_p_pairs,
                         document = "NDR")

print(prompt)

Role:
You are a journalist in charge of the Singapore National Day Rally (NDR).
You have to use context from the NDR transcription to answer the query.

Instruction:
Your response should cite the source's year. Don't cite the context number.
Categorise your answers and provide headings. 
If you are unable to provide an answer, state "Unable to find, submit prompt again."

Background:
The contexts are taken from a speech given by Singapore Prime Minister during the National Day Rally (NDR). 
NDR is an annual event where the Prime Minister addresses the nation on key issues and policies, and update Singaporeans on the country’s progress.

CONTEXT
Context 0:
Year: 2024
Page number: Introduction
Content: We have introduced the Community Care Apartments. This is a new housing
concept that integrates elderly-friendly housing with care and social services

Context 1:
Year: 2024
Page number: Introduction
Content: We are implementing the new nationwide Age Well SG – to enable our seniors
to sta

In [27]:
# as comparison
# Can be run multiple times when change in query
bad_prompt = generate_bad_prompt(
    question, 
    best_chunks, 
    chunking, 
    s_p_pairs
)

print(bad_prompt)

We have introduced the Community Care Apartments. This is a new housing
concept that integrates elderly-friendly housing with care and social services

We are implementing the new nationwide Age Well SG – to enable our seniors
to stay active and age well within their communities

Anyone who wants to upgrade their skills – at any point in their lives, no matter
how old – will be given the opportunity to do so in Singapore

More fathers are playing a bigger role in their families – even changing diapers
and doing housework!
I had a chat with a colleague from the Ministry of Finance, Arash. He became a
father when he was busy helping me with Budget preparations two years ago

But employers and businesses are concerned about managing manpower gaps
when their employees are away for an extended period

It is not possible for them to pursue their goals, and still carry a heavier share of
the caregiving and housework responsibilities

We are all familiar with the Bay South Garden, this is wher

In [None]:
# response = rag_pipeline(params, prompt, llm)
# I get a deadlock error when placed in another file. Is this ok?
# yes, it's ok. Doesn't affect the LLM outputs. So just disable parallelism since not needed 
'''
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
To disable this warning, you can either:
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
'''

In [35]:
os.environ["TOKENIZERS_PARALLELISM"] = "false"

response = rag_pipeline(params, prompt, llm)

### Elderly Benefits

**Community Care Apartments (2024)**
- The introduction of Community Care Apartments integrates elderly-friendly housing with care and social services. This new housing concept aims to provide a supportive living environment for the elderly, enhancing their quality of life by combining accommodation with necessary care and social support (Context 0, 2024).

**Nationwide Age Well SG Programme (2024)**
- The implementation of the Age Well SG programme is designed to enable seniors to stay active and age well within their communities. This initiative focuses on promoting the well-being of the elderly by encouraging them to remain engaged and healthy (Context 1, 2024).

### General Support for Skills Upgrade

**Opportunities for Lifelong Learning (2024)**
- Additionally, Singapore is enabling individuals of any age to upgrade their skills at any point in their lives. This policy ensures that elderly members of society, who wish to learn new skills or enhance existing ones, have the opportunity to do so, thereby staying productive and engaged (Context 2, 2024).

### Social and Financial Support

**Subsidies for Housing (2024)**
- There are also provisions for subsidies for housing, including standard, Plus, and Prime flats, which can help elderly citizens afford quality housing. These subsidies aim to keep such accommodations within reach for the majority of Singaporeans, ensuring that the elderly have access to suitable living conditions (Context 17, 2024).

In [36]:
os.environ["TOKENIZERS_PARALLELISM"] = "false"

bad_response = rag_pipeline(params, bad_prompt, llm)

The elderly in Singapore are benefiting through several initiatives and programmes:

1. **Community Care Apartments**: This new housing concept integrates elderly-friendly housing with care and social services. It is designed to meet the needs of senior citizens, ensuring they have a conducive living environment that supports their health and well-being.

2. **Age Well SG**: Under this nationwide programme, seniors are encouraged to stay active and age well within their communities. This enables them to lead fulfilling lives with activities and opportunities to engage socially, which is crucial for their mental and physical health.

3. **Skills Upgrading Opportunities**: Seniors who wish to upgrade their skills have the opportunity to do so at any point in their lives. This lifelong learning initiative ensures that the elderly can continue to learn and grow, staying relevant and engaged in various aspects of life.

4. **Subsidies for Housing**: Subsidies for Plus and Prime flats ensure housing remains affordable for the elderly, especially in new housing developments.

5. **Training Allowances**: Starting from next year, the SkillsFuture Level-Up programme will provide a new training allowance to aid those seeking to upgrade their skills, including seniors.

These initiatives collectively aim to provide a robust support system that addresses the housing, social, health, and continuous learning needs of the elderly in Singapore, thus promoting a more inclusive and caring society.