In [11]:
# langchain libraries
from langchain.retrievers import EnsembleRetriever
from langchain_community.retrievers import BM25Retriever
from langchain_community.vectorstores import FAISS
from langchain_openai import OpenAIEmbeddings

# huggingface libraries
from sentence_transformers import CrossEncoder

# Put the parent directory (Auditbot_backend) on the import path so the
# `utils` imports can be resolved
import sys
import os

notebook_dir = os.path.dirname(f"{os.getcwd()}/inmemory_retriever.ipynb")
backend_root = os.path.abspath(os.path.join(notebook_dir, '..'))
sys.path.append(backend_root)

# constants
from utils.initialisations import OPENAI_API_KEY, s_p_pairs_path

# custom helper functions
from utils.json_parser import json_file_to_dict

# Other useful libraries
import numpy as np

In [3]:
# Query used throughout the retrieval experiments
query = "What are the findings pertaining to grant?"

# Load the helper data structure (page number : headings/sections)
it_path = "../data/parsed_documents/inverted_tree.json"
inverted_tree = json_file_to_dict(it_path)

# The keys of the tree are the text chunks to search over
chunks = [chunk for chunk in inverted_tree.keys()]
print("Number of chunks:", len(chunks))

Number of chunks: 8210


In [4]:
def langchain_docs_to_textlist(langchain_docs):
    """
    Pull the raw text out of a collection of langchain documents.

    @param langchain_docs: list, langchain document objects exposing `page_content`
    @return textlist: list, the `page_content` (str) of every document, in input order
    """
    return [document.page_content for document in langchain_docs]

In [5]:
def ranking(chunks, query, top_k):
    """
    Retrieve candidate chunks for a query with an equal-weight BM25 + FAISS ensemble.

    @param chunks: list, text chunks (str) to index
    @param query: str, the search query
    @param top_k: int, number of results requested from EACH retriever; the
                  ensemble merges both result lists, so more than top_k (up to
                  2 * top_k) chunks can be returned
    @return good_chunks: list, text (str) of the retrieved documents
    @return good_langchain_docs: list, the retrieved langchain document objects,
                                 in the same order as good_chunks
    """
    # Materialise once: both retrievers index the same texts, and the caller's
    # list is never handed to the libraries directly
    texts = list(chunks)

    # Nothing to index: return early instead of building empty indexes and
    # calling the embedding API
    if not texts:
        return [], []

    # initialize the bm25 retriever
    bm25_retriever = BM25Retriever.from_texts(texts)
    bm25_retriever.k = top_k

    # initialize the faiss retriever
    embedding = OpenAIEmbeddings(model="text-embedding-3-small", api_key=OPENAI_API_KEY)
    faiss_vectorstore = FAISS.from_texts(texts, embedding)
    faiss_retriever = faiss_vectorstore.as_retriever(search_kwargs={"k": top_k})

    # initialize the ensemble retriever
    ensemble_retriever = EnsembleRetriever(
        retrievers=[bm25_retriever, faiss_retriever], weights=[0.5, 0.5]
    )

    good_langchain_docs = ensemble_retriever.invoke(query)
    good_chunks = langchain_docs_to_textlist(good_langchain_docs)

    return good_chunks, good_langchain_docs

In [6]:
# First-pass retrieval: ask each retriever in the ensemble for its top 5 chunks
good_chunks, good_langchain_docs = ranking(chunks=chunks, query=query, top_k=5)

In [7]:
# Number of candidates, then every chunk next to its langchain document
print(len(good_chunks))
for chunk, langchain_doc in zip(good_chunks, good_langchain_docs):
    print(
        chunk,
        langchain_doc,
        "----------------------------------------------------------------",
        sep="\n",
    )

10
Details of the lapses pertaining to the enforcement of SDL collections are in the 
 
following paragraphs
page_content='Details of the lapses pertaining to the enforcement of SDL collections are in the \n \nfollowing paragraphs'
----------------------------------------------------------------
Stage 1: Grant Design and Setup
– whether processes were in place to ensure that grant programmes 
were authorised and reviewed for relevance
b
page_content='Stage 1: Grant Design and Setup\n– whether processes were in place to ensure that grant programmes \nwere authorised and reviewed for relevance\nb'
----------------------------------------------------------------
Audit findings are conveyed by AGO to the ministries and statutory boards audited 
by way of “management letters”
page_content='Audit findings are conveyed by AGO to the ministries and statutory boards audited \nby way of “management letters”'
----------------------------------------------------------------
Stage 1: Grant Design a

In [8]:
def reranking(model_name, good_chunks, query, top_n = -1):
    """
    Re-rank candidate chunks against the query with a cross-encoder.

    @param model_name: str, model name/path handed to CrossEncoder
    @param good_chunks: list, candidate text chunks (str) to score
    @param query: str, the query every chunk is scored against
    @param top_n: int or None, maximum number of chunks to return; a negative
                  value (the default) or None returns every chunk, 0 returns none
    @return best_chunks: list, chunks ordered from highest to lowest score
    """
    # Nothing to score: skip loading the model altogether
    if len(good_chunks) == 0:
        return []

    model = CrossEncoder(model_name, max_length=512)
    # The cross-encoder scores (query, chunk) pairs
    pairs = [(query, chunk) for chunk in good_chunks]
    scores = model.predict(pairs)
    print("scores:", scores)
    # argsort is ascending, so reverse it to put the best score first
    best_idxs = np.argsort(scores)[::-1]
    print("best idxs:", best_idxs)

    # Previously the default top_n = -1 made the `k >= top_n` check true on the
    # first iteration, so an empty list was returned; negative/None now means "all"
    if top_n is not None and top_n >= 0:
        best_idxs = best_idxs[:top_n]

    return [good_chunks[idx] for idx in best_idxs]

In [9]:
# Second pass: keep the 5 candidates the cross-encoder scores highest
best_chunks = reranking(
    model_name="cross-encoder/ms-marco-MiniLM-L-12-v2",
    good_chunks=good_chunks,
    query=query,
    top_n=5,
)

for chunk in best_chunks:
    print(
        chunk,
        "----------------------------------------------------------------",
        sep="\n",
    )

scores: [0.10725074 0.2886162  0.1900847  0.27472395 0.18794931 0.34453192
 0.16118707 0.3218784  0.074872   0.34653497]
best idxs: [9 5 7 1 3 2 4 6 0 8]
Stage 2: Grant Evaluation and Approval
– whether there were processes and controls in place to ensure that 
grant applications were properly evaluated and approved
----------------------------------------------------------------
Stage 2: Grant Evaluation and Approval
–	
Whether there were processes and controls in place to ensure 
that grant cases were properly evaluated and approved
----------------------------------------------------------------
Application, evaluation and award of grants 
– whether the processes to invite, receive, evaluate and approve 
proposals and contract with grant recipients2 were properly administered
b
----------------------------------------------------------------
Stage 1: Grant Design and Setup
– whether processes were in place to ensure that grant programmes 
were authorised and reviewed for relevance
b

In [12]:
# Load the lookup stored at s_p_pairs_path; it is indexed by chunk text to get
# the paragraph for that chunk (presumably sentence -> paragraph pairs; confirm)
s_p_pair = json_file_to_dict(s_p_pairs_path)


In [13]:
# Show the paragraph looked up for each re-ranked chunk
for chunk in best_chunks:
    paragraph = s_p_pair[chunk]
    print(paragraph, "-------------", sep="\n")

Stage 2: Grant Evaluation and Approval
– whether there were processes and controls in place to ensure that 
grant applications were properly evaluated and approved; and
– whether agreements with grant recipients were properly entered into
-------------
Stage 2: Grant Evaluation and Approval
–	
Whether there were processes and controls in place to ensure 
that grant cases were properly evaluated and approved; and
–	
Whether proper terms and conditions were stipulated for 
compliance
-------------
Application, evaluation and award of grants 
– whether the processes to invite, receive, evaluate and approve 
proposals and contract with grant recipients2 were properly administered
b
-------------
Stage 1: Grant Design and Setup
– whether processes were in place to ensure that grant programmes 
were authorised and reviewed for relevance
b
-------------
Stage 1: Grant Design and Setup
– whether there were processes and controls in place to ensure that 
grant programmes were authorised and adm

In [None]:
# In the ensemble retriever, the text chunks are used as dictionary keys and the
# metadata are the values.

# If the same text chunk appears with different metadata, the one loaded into the
# dictionary first is kept (see the demo cells: the metadata of the retriever
# listed first wins).

# This means the metadata tagged to the chunks is of no use to us, because we use
# the same documents for both retrievers. It is only useful if the retrievers
# use different documents.

In [14]:
# Demo: each retriever returns only its top 2, so the two result lists overlap

doc_list_1 = [
    "I like apples",
    "I like oranges",
    "Apples and oranges are fruits",
    "I like chimps",
    "I like sydney",
    "I like kangaroos",
    "I like Pi"
]

# faiss retriever: every document is tagged with source 2
embedding = OpenAIEmbeddings(model="text-embedding-3-small", api_key=OPENAI_API_KEY)
faiss_vectorstore = FAISS.from_texts(
    doc_list_1,
    embedding,
    metadatas=[{"source": 2}] * len(doc_list_1),
)
faiss_retriever = faiss_vectorstore.as_retriever(search_kwargs={"k": 2})

# bm25 retriever: every document is tagged with source 1
bm25_retriever = BM25Retriever.from_texts(
    doc_list_1,
    metadatas=[{"source": 1}] * len(doc_list_1),
)
bm25_retriever.k = 2

# equal-weight ensemble, bm25 listed first
ensemble_retriever = EnsembleRetriever(
    retrievers=[bm25_retriever, faiss_retriever],
    weights=[0.5, 0.5],
)

docs = ensemble_retriever.invoke("apples")
docs

[Document(page_content='I like apples', metadata={'source': 1}),
 Document(page_content='I like Pi', metadata={'source': 1}),
 Document(page_content='Apples and oranges are fruits', metadata={'source': 2})]

In [15]:
# Demo: k equals the number of documents, so both retrievers return everything

# faiss retriever: every document is tagged with source 2
embedding = OpenAIEmbeddings(model="text-embedding-3-small", api_key=OPENAI_API_KEY)
faiss_vectorstore = FAISS.from_texts(
    doc_list_1,
    embedding,
    metadatas=[{"source": 2}] * len(doc_list_1),
)
faiss_retriever = faiss_vectorstore.as_retriever(search_kwargs={"k": 7})

# bm25 retriever: every document is tagged with source 1
bm25_retriever = BM25Retriever.from_texts(
    doc_list_1,
    metadatas=[{"source": 1}] * len(doc_list_1),
)
bm25_retriever.k = 7

# equal-weight ensemble, bm25 listed first
ensemble_retriever = EnsembleRetriever(
    retrievers=[bm25_retriever, faiss_retriever],
    weights=[0.5, 0.5],
)

docs = ensemble_retriever.invoke("apples")
docs

[Document(page_content='I like apples', metadata={'source': 1}),
 Document(page_content='I like kangaroos', metadata={'source': 1}),
 Document(page_content='I like Pi', metadata={'source': 1}),
 Document(page_content='Apples and oranges are fruits', metadata={'source': 1}),
 Document(page_content='I like oranges', metadata={'source': 1}),
 Document(page_content='I like chimps', metadata={'source': 1}),
 Document(page_content='I like sydney', metadata={'source': 1})]

In [16]:
# Demo: same as the "include all" cell, but the retrievers are handed to the
# ensemble in the opposite order (faiss first)

# faiss retriever: every document is tagged with source 2
embedding = OpenAIEmbeddings(model="text-embedding-3-small", api_key=OPENAI_API_KEY)
faiss_vectorstore = FAISS.from_texts(
    doc_list_1,
    embedding,
    metadatas=[{"source": 2}] * len(doc_list_1),
)
faiss_retriever = faiss_vectorstore.as_retriever(search_kwargs={"k": 7})

# bm25 retriever: every document is tagged with source 1
bm25_retriever = BM25Retriever.from_texts(
    doc_list_1,
    metadatas=[{"source": 1}] * len(doc_list_1),
)
bm25_retriever.k = 7

# equal-weight ensemble, faiss listed first this time
ensemble_retriever = EnsembleRetriever(
    retrievers=[faiss_retriever, bm25_retriever],
    weights=[0.5, 0.5],
)

docs = ensemble_retriever.invoke("apples")
docs

[Document(page_content='I like apples', metadata={'source': 2}),
 Document(page_content='I like kangaroos', metadata={'source': 2}),
 Document(page_content='Apples and oranges are fruits', metadata={'source': 2}),
 Document(page_content='I like Pi', metadata={'source': 2}),
 Document(page_content='I like oranges', metadata={'source': 2}),
 Document(page_content='I like chimps', metadata={'source': 2}),
 Document(page_content='I like sydney', metadata={'source': 2})]