In [1]:
import sys
import os

# Add the parent directory (Auditbot_backend) to the system path
sys.path.append(
    os.path.abspath(
        os.path.join(
            os.path.dirname(f"{os.getcwd()}/chaining.ipynb"),
            '..'
        )
    )
)

from utils.preprocessing import *
from utils.content_page_parser import *
from utils.initialisations import *
from utils.db_utils import *
from utils.custom_print import *
from utils.retriever import *
from utils.json_parser import *
from utils.prompt_engineering import *
from utils.langsmith_trace import *

from langchain_core.runnables import RunnableLambda, RunnableParallel, RunnablePassthrough

In [2]:
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [3]:
# RAG input parameters =======================================================

question = "What are the findings pertaining to grant?"
# query = "extract finding on weakness in access controls from FY2018/19 to FY2020/21 AGO's report."
question2 = "extract finding on weakness in access controls from FY2018/19 to FY2020/21 AGO's report. Tabulate the output with row heading as Year of Report and details of findings."

# HYPERPARAMETERS ============================================================

# preprocessing --------------------------------------------------------------

# Chunk into sentences ('s') or paragraphs ('p')
chunking='s' 

# Group smaller chunks into a bigger chunk
grouping=1

# control minimum chubk size
min_chunk_size=100

# vector store ---------------------------------------------------------------

# add to data base in batches
batch_size = 1000

# Ranking --------------------------------------------------------------------

# top k matches for ranking. 
# Both sparse and dense search find top_k matches so hybrid search will return 
# at least top_k matches and most 2 * top_k matches
top_k = 30

# weights for each retrieval for reciprocal rank fusion
weights = [0.5, 0.5]

# reciprocal ranking fusion constant
k = 60

# Reranking ------------------------------------------------------------------

# top n matches for reranking
top_n = 20

# Cross encoder model
# claimed to be deprecated because it is bad but seems to still work fine
# model_name = "cross-encoder/stsb-roberta-base"

# best performing on Microsoft tests
model_name = "cross-encoder/ms-marco-MiniLM-L-12-v2"



# IMPROVEMENTS ===============================================================
HyDE = False
if HyDE:
    comments = "This is using HyDE"
else:
    comments = "None"

# pack parameters for tracing ================================================
params = (question, chunking, grouping, min_chunk_size, batch_size, top_k, 
          weights, k, top_n, model_name, HyDE, comments)

In [4]:
# RUN ONCE
# generate all required data structures

# generate chunks
generate_chunks(DOCUMENT_DIR,
                chunks_path,
                chunk_pageNum_pairs_path,
                s_p_pairs_path, 
                chunking, 
                grouping, 
                min_chunk_size,
                DOC_IDENTIFIER)


# generate inverted tree
has_content_page = True
generate_inverted_tree(chunk_pageNum_pairs_path, 
                       has_content_page, 
                       save_inverted_tree_path,
                       tree_path)

2.8488190174102783 seconds
number of chunks: 9127


In [5]:
# RUN ONCE
# retrieve all required data structures
inverted_tree = json_file_to_dict(save_inverted_tree_path)

# load chunks from tree's keys
chunks = list(inverted_tree.keys())
print("Number of unique chunks:", len(chunks))

# prepare metadata for chromadb
pre_metadata = list(inverted_tree.values())
metadata = chroma_preprocess_metadata(pre_metadata)

# load sentence paragraph pairs
if (chunking == 's' or chunking == 'f') and grouping == 1:
    print("s_p_pairs will be filled")
    s_p_pairs = json_file_to_dict(s_p_pairs_path)
else:
    s_p_pairs = {}

Number of unique chunks: 8210
s_p_pairs will be filled


In [6]:
# create db
client_dense = chromadb.PersistentClient(path="../data/db", 
                                   settings = Settings(allow_reset=True))

In [7]:
# chromadb supported model
openai_ef = embedding_functions.OpenAIEmbeddingFunction(
                api_key=OPENAI_API_KEY,
                model_name="text-embedding-3-small"
            )

In [8]:
# chromadb's embedding function needs streaming
collection = chroma_get_or_create_collection(client_dense, 
                                             name = "audit", 
                                             embedding_function = openai_ef, 
                                             reset = True)

In [9]:
# fill db
chroma_fill_db(collection, chunks, metadata, batch_size)
print("number of embeddings in database:",collection.count())

number of embeddings in database: 8210


In [10]:
# RUN ONCE
# set up elasticseach for sparse embedding search

# create docker container for elasticsearch on terminal/shell
'''
docker run -p 127.0.0.1:9200:9200 -d --name elasticsearch --network elastic-net \
  -e ELASTIC_PASSWORD=$ELASTIC_PASSWORD \
  -e "discovery.type=single-node" \
  -e "xpack.security.http.ssl.enabled=false" \
  -e "xpack.license.self_generated.type=trial" \
  docker.elastic.co/elasticsearch/elasticsearch:8.14.3
'''

# connect to the Elasticsearch cluster from python elasticsearch client
client_sparce = Elasticsearch(
    LOCAL_HOST_URL,
    basic_auth=HTTP_AUTH
)
# checks if client is connected to docker container
print(client_sparce.info(http_auth=HTTP_AUTH))

# index chunks using elasticsearch (saved in docker)
index_elastic_db(client_sparce, index_name, HTTP_AUTH, chunks, reset = True)

{'name': '50cd72118574', 'cluster_name': 'docker-cluster', 'cluster_uuid': 'l-puMSQDTvqL7nQDfYreOg', 'version': {'number': '8.14.3', 'build_flavor': 'default', 'build_type': 'docker', 'build_hash': 'd55f984299e0e88dee72ebd8255f7ff130859ad0', 'build_date': '2024-07-07T22:04:49.882652950Z', 'build_snapshot': False, 'lucene_version': '9.10.0', 'minimum_wire_compatibility_version': '7.17.0', 'minimum_index_compatibility_version': '7.0.0'}, 'tagline': 'You Know, for Search'}
reset index: chromadb_documents


In [11]:
if HyDE:
    query = gpt4o(hyde(question), llm)

else:
    query = question

In [12]:

def sparce_retrieval(query):
    return bm25_elasticsearch(client_sparce, index_name, HTTP_AUTH, query, top_k)

def dense_retrieval(query):
    return chromadb_embedding_search(collection, query, top_k)

retrieval = RunnableParallel(
    {
        "sparce": sparce_retrieval,
        "dense": dense_retrieval,
        "query": RunnablePassthrough()
    }
)

# --------------------------------------------------------------------------

def rank(kwargs):
    bm25_results = kwargs["sparce"]
    embedding_results = kwargs["dense"]
    good_chunks = reciprocal_rank_fusion(bm25_results, 
                                  embedding_results, 
                                  weights, 
                                  k)
    return {
                "query": kwargs["query"], 
                "good chunks": good_chunks
           }

rank = RunnableLambda(rank)

# ---------------------------------------------------------------------------

def rerank(kwargs):
    query = kwargs["query"]
    good_chunks = kwargs["good chunks"]
    best_chunks, scores = reranking(model_name, good_chunks, query, top_n)
    return {
                "query": query, 
                "best chunks": best_chunks
           }

rerank = RunnableLambda(rerank)

# ---------------------------------------------------------------------------

def augment(kwargs):
    query = kwargs["query"]
    best_chunks = kwargs["best chunks"]
    prompt = generate_prompt(
        query, 
        inverted_tree, 
        best_chunks, 
        chunking, 
        s_p_pairs)
    
    return {
                "query": query,
                "prompt": prompt
           }

augment = RunnableLambda(augment)

# ---------------------------------------------------------------------------

def generate(kwargs):
    query = kwargs["query"]
    prompt = kwargs["prompt"]
    new_params = (query, ) + params
    response = rag_pipeline(new_params, prompt, llm)
    return response

generate = RunnableLambda(generate)

In [13]:
rag = retrieval | rank | rerank | augment | generate

response = rag.invoke(question)



ValueError: too many values to unpack (expected 12)