In [None]:
#pip install openai==0.28
#pip install -U langchain-community
#pip install pypdf 
#pip install chromadb
#pip install rank_bm25

In [None]:
import os

from langchain.chains import ConversationalRetrievalChain
from langchain.chat_models import ChatOpenAI
from langchain_openai import ChatOpenAI
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import Chroma, DocArrayInMemorySearch
from langchain.retrievers import BM25Retriever
from langchain.retrievers.ensemble import EnsembleRetriever

In [None]:
# Build the document index: load the PDF, split it into overlapping chunks,
# embed the chunks and store them in a Chroma vector store.

# Location of the source PDF. Set the SPACE_PDF_PATH environment variable to
# point at a different copy; the fallback is the original author's path.
PDF_PATH = os.environ.get(
    "SPACE_PDF_PATH",
    "/Users/pradhikshasuresh/Documents/Python/Space.pdf",
)

# Chunking parameters (in characters) used by the text splitter.
CHUNK_SIZE = 1000
CHUNK_OVERLAP = 150

# Load the PDF document
loader = PyPDFLoader(PDF_PATH)
documents = loader.load()

# Split the document into chunks
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
)
docs = text_splitter.split_documents(documents)

# Create embeddings. The key is read from the environment so that no secret is
# stored in the notebook; a missing variable raises KeyError right here rather
# than surfacing later as an authentication failure.
embeddings = OpenAIEmbeddings(openai_api_key=os.environ["OPENAI_API_KEY"])

# Create a vector store with Chroma
db = Chroma.from_documents(docs, embeddings)

In [None]:
# Build the three retrievers that are compared against each other.

# Number of chunks every retriever returns. Sharing one value keeps the
# comparison even: without it the BM25 retriever falls back to its own default
# `k`, which differs from the similarity retriever's setting.
TOP_K = 5

# Create a similarity retriever (dense / embedding based)
retriever_similarity = db.as_retriever(
    search_type="similarity",
    search_kwargs={"k": TOP_K},
)

# Create a BM25 retriever (sparse / keyword based) over the same chunks
retriever_bm25 = BM25Retriever.from_documents(docs)
retriever_bm25.k = TOP_K

# Initialize the Ensemble Retriever with BM25 and similarity retrievers.
# Equal weights are spelled out so the fusion setting is visible and tunable.
ensemble_retriever = EnsembleRetriever(
    retrievers=[retriever_similarity, retriever_bm25],
    weights=[0.5, 0.5],
)

In [None]:
# Build one conversational QA chain per retriever and run the same four
# questions through each of them so the answers can be compared.

# Initialize the LLM model. The API key comes from the environment so that no
# secret is stored in the notebook; a missing variable raises KeyError here.
llm_model = ChatOpenAI(
    model_name="gpt-3.5-turbo",
    temperature=0,
    openai_api_key=os.environ["OPENAI_API_KEY"],
)

# Create QA chain for similarity retriever
qa_similarity = ConversationalRetrievalChain.from_llm(
    llm=llm_model,
    retriever=retriever_similarity,
)

# Create QA chain for BM25 retriever
qa_bm25 = ConversationalRetrievalChain.from_llm(
    llm=llm_model,
    retriever=retriever_bm25,
)

# Create QA chain for ensemble retriever
qa_ensemble = ConversationalRetrievalChain.from_llm(
    llm=llm_model,
    retriever=ensemble_retriever,
    chain_type="stuff",
)

# Define the questions used for the comparison
question1 = "Explain the concept of vacuum?"
question2 = "What is the width of the milkyway galaxy?"
question3 = "how much oxygen does the air have?"
question4 = "Briefly explain what are charged particles and their 3 primary sources"
questions = [question1, question2, question3, question4]

# The history is never appended to, so every question is asked with an empty
# conversation and the answers stay independent of each other.
chat_history = []


def ask_chain(chain, question):
    """Run one question through `chain` with the shared (empty) chat history.

    Uses `invoke` for every chain; calling a chain object directly is the
    deprecated entry point and was previously used only for the ensemble chain.
    """
    return chain.invoke({"question": question, "chat_history": chat_history})


# Get answers from the similarity retriever
result_similarity1, result_similarity2, result_similarity3, result_similarity4 = (
    ask_chain(qa_similarity, question) for question in questions
)

# Get answers from the BM25 retriever
result_bm251, result_bm252, result_bm253, result_bm254 = (
    ask_chain(qa_bm25, question) for question in questions
)

# Get answers from the ensemble retriever
result_ensemble1, result_ensemble2, result_ensemble3, result_ensemble4 = (
    ask_chain(qa_ensemble, question) for question in questions
)

In [None]:
# Print the results for comparison: for each question, show the similarity,
# BM25 and ensemble answers one after another.

# One tuple per question, in question order.
comparison_rounds = [
    (result_similarity1, result_bm251, result_ensemble1),
    (result_similarity2, result_bm252, result_ensemble2),
    (result_similarity3, result_bm253, result_ensemble3),
    (result_similarity4, result_bm254, result_ensemble4),
]

for number, (similarity_answer, bm25_answer, ensemble_answer) in enumerate(
    comparison_rounds, start=1
):
    # Every heading except the very first is preceded by a blank line.
    leading_gap = "" if number == 1 else "\n"

    # The heading number is derived from the loop index, which fixes the
    # second similarity result previously being labelled "Result 1".
    print(f"{leading_gap}Similarity Retriever Result {number}:")
    print(similarity_answer)
    print(f"\nBM25 Retriever Result {number}:")
    print(bm25_answer)
    print(f"\nEnsemble Result {number}:")
    print(ensemble_answer)