In [11]:
from dotenv import load_dotenv
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import Chroma
from langchain_openai import ChatOpenAI
from sentence_transformers import SentenceTransformer
from langchain.vectorstores import Chroma
from langchain.embeddings import HuggingFaceEmbeddings
from aiolimiter import AsyncLimiter
from langchain.schema import Document
from openai import OpenAI
from collections import deque
from langchain_community.cross_encoders import HuggingFaceCrossEncoder
from langchain.retrievers.document_compressors import CrossEncoderReranker
from langchain.retrievers.document_compressors import LLMChainFilter
from langchain.retrievers import ContextualCompressionRetriever
import time
import asyncio
import uuid
import os
load_dotenv()

True

In [2]:
laws_dir = "data/law/"       
procedures_dir = "data/procedures/"

In [4]:
embeddings = HuggingFaceEmbeddings(model_name="nlpaueb/legal-bert-base-uncased",model_kwargs={"device": "cuda"})
summariser = ChatOpenAI(model="gpt-4o-mini", temperature= 0,
                    max_tokens= 200,)
# summariser = ChatOpenAI(
#     model="meta-llama/Llama-3.1-8B-Instruct",
#     openai_api_base="https://mv8dkx45ioa5iuc9.us-east-1.aws.endpoints.huggingface.cloud/v1",
#     openai_api_key=os.environ["hf_token"],
#     temperature=0
# )

No sentence-transformers model found with name nlpaueb/legal-bert-base-uncased. Creating a new one with mean pooling.


In [5]:
def clean_text(text: str) -> str:
    # strip headers/footers and blank lines
    lines = [l for l in text.splitlines() if l.strip() and not l.lower().startswith('page')]
    return "\n".join(lines)

In [6]:

# store timestamps of last minute's requests
request_times = deque()
RPM_LIMIT = 490
WINDOW = 60   # seconds

async def rate_guard():
    now = time.time()
    # drop requests older than 60s
    while request_times and now - request_times[0] > WINDOW:
        request_times.popleft()
    if len(request_times) >= RPM_LIMIT:
        sleep_for = WINDOW - (now - request_times[0])
        print(f"RPM > {RPM_LIMIT}, sleeping {sleep_for:.0f}s")
        await asyncio.sleep(sleep_for)
        return await rate_guard()   # re-check after sleep
    # log this request
    request_times.append(now)

In [7]:
limiter = AsyncLimiter(3, 1) 

async def summarize_batch(chunks, batch_size=5):
    """
    Send 3 chunks at a time in a single prompt and get back structured summaries.
    """
    results = []
    # Break chunks into batches of batch_size
    for i in range(0, len(chunks), batch_size):
        batch = chunks[i:i + batch_size]
        text = "\n\n---\n\n".join([c.page_content for c in batch])
        prompt = (
            "You are a helpful assistant designed to generate high-quality supporting context for a given chunk of text"
            "from a legal law pdf. This context will help a language model (LLM) better understand the chunk"
            "when it is passed separately from the pdf. For each section separated by '---', "
            "write a concise 1-2 sentence summary numbered 1, 2, 3...\n\n"
            f"{text}"
        )

        async with limiter:  # rate-limit outbound requests
            await rate_guard() 
            resp = await summariser.ainvoke(prompt)

        # split the model's response back onto the right chunks
        lines = resp.content.strip().split("\n")
        for idx, c in enumerate(batch):
            c.metadata["uuid"] = str(uuid.uuid4())
            # crude mapping: line order to chunk order
            c.metadata["summary"] = lines[idx] if idx < len(lines) else ""
            results.append(c)
    return results

In [8]:
pdf_semaphore = asyncio.Semaphore(3)
async def process_pdf(pdf_path, db_name):
    async with pdf_semaphore:
        print(f"Starting {pdf_path}")
        loader = PyPDFLoader(pdf_path)
        docs = loader.load()
        docs = [Document(page_content=clean_text(d.page_content)) for d in docs]
        splitter = RecursiveCharacterTextSplitter(chunk_size=6000, chunk_overlap=500)
        chunks = splitter.split_documents(docs)
        for c in chunks:
            c.metadata["source_folder"] = os.path.basename(os.path.dirname(pdf_path))
        async with limiter:
            annotated = await summarize_batch(chunks, batch_size=3)
        print(f"Completed {pdf_path} with {len(chunks)} chunks batched.")
        return annotated

In [9]:
async def build_db_parallel(root_path: str, db_name: str):
    all_chunks = []
    tasks = []
    for root, _, files in os.walk(root_path):
        for file in files:
            if file.lower().endswith('.pdf'):
                pdf_path = os.path.join(root, file)
                tasks.append(process_pdf(pdf_path, db_name))
    # Run up to 3 PDFs in parallel
    results = await asyncio.gather(*tasks)
    for annotated in results:
        all_chunks.extend(annotated)

    chroma_db = Chroma.from_documents(
        all_chunks,
        embeddings,
        collection_name=db_name,
        persist_directory=f"./{db_name}_chroma",
        collection_metadata={"hnsw:space": "cosine"}
    )
    return chroma_db

In [10]:
async def main():
    laws_task = build_db_parallel("data/law", "laws_db")
    proc_task = build_db_parallel("data/procedures", "procedures_db")
    laws_db, procedures_db = await asyncio.gather(laws_task, proc_task)
    print("Both vector stores ready.")
    return laws_db, procedures_db

laws_db, procedures_db = await main()

Starting data/law\vic_acts_laws_pdfs\02-15aa013-authorised.pdf
Starting data/law\vic_acts_laws_pdfs\03-69aa015-authorised.pdf
Starting data/law\vic_acts_laws_pdfs\03-81aa025-authorised.pdf
Completed data/law\vic_acts_laws_pdfs\02-15aa013-authorised.pdf with 84 chunks batched.
Starting data/law\vic_acts_laws_pdfs\04-107aa045-authorised.PDF
Completed data/law\vic_acts_laws_pdfs\03-81aa025-authorised.pdf with 101 chunks batched.
Starting data/law\vic_acts_laws_pdfs\04-15aa022-authorised.pdf
Completed data/law\vic_acts_laws_pdfs\03-69aa015-authorised.pdf with 118 chunks batched.
Starting data/law\vic_acts_laws_pdfs\04-16aa014-authorised.pdf
Completed data/law\vic_acts_laws_pdfs\04-15aa022-authorised.pdf with 44 chunks batched.
Starting data/law\vic_acts_laws_pdfs\05-49aa022-authorised.pdf
Completed data/law\vic_acts_laws_pdfs\04-16aa014-authorised.pdf with 61 chunks batched.
Starting data/law\vic_acts_laws_pdfs\06-15aa012%20authorised.pdf
Completed data/law\vic_acts_laws_pdfs\06-15aa012%20

In [None]:
chatgpt = ChatOpenAI(model="gpt-4o-mini", temperature=0)
similarity_retriever = laws_db.as_retriever(search_type="similarity",
                                              search_kwargs={"k": 5})

#  LLMChainfilter decides which of the initially retrieved documents to filter out and which ones to return
# Retriever 2 - retrieves the documents similar to query and then applies the filter
_filter = LLMChainFilter.from_llm(llm=chatgpt)
compressor_retriever = ContextualCompressionRetriever(
    base_compressor=_filter, base_retriever=similarity_retriever
)

# download an open-source reranker model - BAAI/bge-reranker-v2-m3
reranker = HuggingFaceCrossEncoder(model_name="BAAI/bge-reranker-large")
reranker_compressor = CrossEncoderReranker(model=reranker, top_n=3)

# Retriever 3 - Uses a Reranker model to rerank retrieval results from the previous retriever
final_retriever = ContextualCompressionRetriever(
    base_compressor=reranker_compressor, base_retriever=compressor_retriever
)

In [33]:
query = "What is the purpose of the Building and Construction Industry Security of Payment Act 2002?"
top_docs = final_retriever.invoke(query)

In [34]:
top_docs

[Document(metadata={'uuid': '9913ce61-2e56-415a-a6e6-e56a444261ca', 'summary': '2. The objective of these regulations is to establish forms necessary for the implementation of the Building and Construction Industry Security of Payment Act 2002.', 'source_folder': 'vic_Statutory_laws_pdfs'}, page_content='Authorised by the Chief Parliamentary Counsel \n1   \nAuthorised Version No. 001 \nBuilding and Construction Industry Security \nof Payment Regulations 2023 \nS.R. No. 34/2023 \nAuthorised Version as at \n27 May 2023 \n 1 Objective \nThe objective of these Regulations is to prescribe \nforms for the purposes of the Building and \nConstruction Industry Security of Payment \nAct 2002. \n 2 Authorising provision \nThese Regulations are made under section 52 of \nthe Building and Construction Industry \nSecurity of Payment Act 2002. \n 3 Commencement \nThese Regulations come into operation on 27 May \n2023. \n 4 Revocation \nThe Building and Construction Industry Security \nof Payment Regu