In [1]:
import os
from typing import List, Tuple

from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain.embeddings import OllamaEmbeddings
from langchain.llms import Ollama
from langchain.retrievers import BM25Retriever, EnsembleRetriever
from langchain.retrievers.document_compressors import LLMChainExtractor
from langchain.retrievers import ContextualCompressionRetriever
from langchain.chains import RetrievalQA

In [2]:
# Initialize Ollama embedding model
embeddings = OllamaEmbeddings(model="nomic-embed-text")
# Load and preprocess PDF document
loader = PyPDFLoader("OSHA -Module 1.pdf")
documents = loader.load()
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
splits = text_splitter.split_documents(documents)

In [3]:
# Create Chroma vector store
vectorstore = Chroma.from_documents(documents=splits, embedding=embeddings)
# Initialize Ollama LLM
llm = Ollama(model="mistral:instruct")

In [4]:
# Dense Passage Retrieval
dense_retriever = vectorstore.as_retriever(search_type="similarity", search_kwargs={"k": 2})

# BM25 (Sparse) Retrieval
bm25_retriever = BM25Retriever.from_documents(splits)
bm25_retriever.k = 2

In [5]:
# Hybrid Retrieval (combining dense and sparse methods)
ensemble_retriever = EnsembleRetriever(
    retrievers=[dense_retriever, bm25_retriever],
    weights=[0.5, 0.5]
)

In [6]:
# Re-ranking with LLM
compressor = LLMChainExtractor.from_llm(llm)
compression_retriever = ContextualCompressionRetriever(
    base_compressor=compressor,
    base_retriever=ensemble_retriever
)

In [7]:
# Create RetrievalQA chain
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=compression_retriever,
    return_source_documents=True
)

In [8]:

def print_retrieval_results(documents: List[str], method: str):
    print(f"\n{method} Retrieval Results:")
    for i, doc in enumerate(documents, 1):
        print(f"{i}. {doc[:100]}...")  # Print first 100 characters of each document


In [9]:
def process_query(query: str) -> Tuple[str, List[str]]:
    """
    Process a query using the RetrievalQA chain and print intermediate retrieval results.
    
    Args:
        query (str): The input query.
    
    Returns:
        Tuple[str, List[str]]: The answer and a list of source documents.
    """
    # Dense retrieval
    dense_docs = dense_retriever.get_relevant_documents(query)
    print_retrieval_results([doc.page_content for doc in dense_docs], "Dense")

    # BM25 retrieval
    bm25_docs = bm25_retriever.get_relevant_documents(query)
    print_retrieval_results([doc.page_content for doc in bm25_docs], "BM25 (Sparse)")

    # Hybrid retrieval
    hybrid_docs = ensemble_retriever.get_relevant_documents(query)
    print_retrieval_results([doc.page_content for doc in hybrid_docs], "Hybrid")

    # Re-ranking
    reranked_docs = compression_retriever.get_relevant_documents(query)
    print_retrieval_results([doc.page_content for doc in reranked_docs], "Re-ranked")

    # Final QA
    result = qa_chain({"query": query})
    answer = result['result']
    source_documents = [doc.page_content for doc in result['source_documents']]
    
    return answer, source_documents

# Example usage
query = "What is OSHA? When this term come into picture.What are the majour incidents that laed to implimentation of workers safety?"
answer, sources = process_query(query)
print(f"Query: {query}")
print(f"Answer: {answer}")
print("Sources:")
for i, source in enumerate(sources, 1):
    print(f"{i}. {source[:100]}...")  # Print first 100 characters of each source

  dense_docs = dense_retriever.get_relevant_documents(query)



Dense Retrieval Results:
1. OSHA’S MISSION AND PURPOSE
•Encourage employers andemployees toreduce workplace hazards .
•Implement...
2. WORK INJURIES BY TYPE OF ACCIDENT
Work injuries can be classified by the type of accident from which...

BM25 (Sparse) Retrieval Results:
1. When death rates are computed on the basis of the number of 
deaths per 100,000 workers in a given y...
2. •Theenvironment inwhich themachine operator is
working isunusually hectic ,and thepressure to
comple...

Hybrid Retrieval Results:
1. OSHA’S MISSION AND PURPOSE
•Encourage employers andemployees toreduce workplace hazards .
•Implement...
2. When death rates are computed on the basis of the number of 
deaths per 100,000 workers in a given y...
3. WORK INJURIES BY TYPE OF ACCIDENT
Work injuries can be classified by the type of accident from which...
4. •Theenvironment inwhich themachine operator is
working isunusually hectic ,and thepressure to
comple...

Re-ranked Retrieval Results:
1. 1. OSHA’S MISSION AND PU

  result = qa_chain({"query": query})


Query: What is OSHA? When this term come into picture.What are the majour incidents that laed to implimentation of workers safety?
Answer:  OSHA (Occupational Safety and Health Administration) is a U.S. government agency responsible for safety and health program regulation in most U.S. workplaces. It was established by the Occupational Safety and Health Act of 1970, primarily due to a series of major incidents involving high death rates in various industries such as Mining/quarrying, Agriculture, Construction, Transportation/public utilities, Manufacturing, Services, Trade.

The major incidents that led to the implementation of worker safety included accidents like overexertion, impact accidents, falls, bodily reactions (to chemicals), compression, motor vehicle accidents, exposure to radiation or caustics, rubbing or abrasions, and exposure to extreme temperatures. These accidents often occurred due to factors such as a hectic work environment, pressure to complete tasks on time, and 