In [None]:
import os
from typing import List, Tuple

from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain.embeddings import OllamaEmbeddings
from langchain.llms import Ollama
from langchain.retrievers import BM25Retriever, EnsembleRetriever
from langchain.retrievers.document_compressors import LLMChainExtractor
from langchain.retrievers import ContextualCompressionRetriever
from langchain.chains import RetrievalQA

In [None]:
# --- Tunable settings for this cell (edit here rather than inline) ---
EMBEDDING_MODEL = "nomic-embed-text"    # Ollama model name used to embed chunks
PDF_PATH = "path/to/your/document.pdf"  # placeholder: point this at a real PDF
CHUNK_SIZE = 1000                       # chunk_size handed to the text splitter
CHUNK_OVERLAP = 200                     # chunk_overlap handed to the text splitter

# Initialize Ollama embedding model
embeddings = OllamaEmbeddings(model=EMBEDDING_MODEL)

# Load the PDF and split it into overlapping chunks
loader = PyPDFLoader(PDF_PATH)
documents = loader.load()
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
)
splits = text_splitter.split_documents(documents)

# Fail fast with a readable message: `splits` feeds both the vector store and
# the BM25 retriever in later cells, and an empty list (e.g. a PDF with no
# extractable text) gives them nothing to index.
if not splits:
    raise ValueError(
        f"No text chunks were produced from {PDF_PATH!r}; "
        "check that the file contains extractable text."
    )

In [None]:

# Index every chunk in a Chroma vector store, embedding it with `embeddings`
vectorstore = Chroma.from_documents(splits, embedding=embeddings)

# Ollama LLM shared by the compression step and the QA chain in later cells
llm_model_name = "llama2"
llm = Ollama(model=llm_model_name)

In [None]:
# Each retriever returns this many chunks per query
top_k = 2

# Dense (embedding-similarity) retrieval over the vector store
dense_retriever = vectorstore.as_retriever(
    search_type="similarity",
    search_kwargs=dict(k=top_k),
)

# Sparse (BM25) retrieval over the same chunks
bm25_retriever = BM25Retriever.from_documents(splits)
bm25_retriever.k = top_k

In [None]:
# Hybrid retrieval: merge the dense and sparse retrievers with equal weight
hybrid_members = [dense_retriever, bm25_retriever]
hybrid_weights = [0.5, 0.5]
ensemble_retriever = EnsembleRetriever(retrievers=hybrid_members, weights=hybrid_weights)

In [None]:
# LLM-based contextual compression on top of the hybrid retriever.
# NOTE(review): the original label here was "Re-ranking with LLM", but
# LLMChainExtractor appears to be an extraction-style compressor (it trims each
# retrieved chunk with the LLM rather than re-ordering results) — confirm
# against the installed LangChain version.
compressor = LLMChainExtractor.from_llm(llm)
compression_retriever = ContextualCompressionRetriever(
    base_retriever=ensemble_retriever,
    base_compressor=compressor,
)

In [None]:
# Build the RetrievalQA chain: "stuff" chain type, fed by the compression
# retriever, and configured to hand back the source documents with the answer.
qa_chain = RetrievalQA.from_chain_type(
    llm,
    retriever=compression_retriever,
    chain_type="stuff",
    return_source_documents=True,
)

In [None]:
def process_query(query: str) -> Tuple[str, List[str]]:
    """
    Answer a question with the module-level ``qa_chain``.

    Args:
        query (str): The question passed to the chain under the ``"query"`` key.

    Returns:
        Tuple[str, List[str]]: The chain's ``'result'`` value, followed by the
        ``page_content`` of each entry in its ``'source_documents'`` (text
        only, in the order the chain returned them).
    """
    response = qa_chain({"query": query})
    answer_text = response['result']

    # Keep only the text of each source document, not the document objects.
    source_texts = []
    for document in response['source_documents']:
        source_texts.append(document.page_content)

    return answer_text, source_texts

# Example usage: ask one question, then show the answer and a preview of each source
query = "What is the main topic of this PDF?"
answer, sources = process_query(query)

print(f"Query: {query}")
print(f"Answer: {answer}")
print("Sources:")

# Preview = first 100 characters of each source, numbered from 1
source_previews = [f"{rank}. {text[:100]}..." for rank, text in enumerate(sources, 1)]
for preview_line in source_previews:
    print(preview_line)