In [None]:
from langchain.document_loaders import PyPDFDirectoryLoader, TextLoader, CSVLoader
from langchain.vectorstores import FAISS
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter, CharacterTextSplitter
from langchain.storage import LocalFileStore
from langchain.retrievers import BM25Retriever, EnsembleRetriever, ContextualCompressionRetriever
from langchain.retrievers.document_compressors import LLMChainExtractor, LLMChainFilter, EmbeddingsFilter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.llms import DeepInfra
from langchain.chains import ConversationalRetrievalChain
from langchain_core.utils.env import get_from_env
from dotenv import load_dotenv
import streamlit as st
from streamlit_chat import message
import tempfile
import os

In [None]:

# Load variables from a local .env file (if one exists) into the process
# environment. `load_dotenv` was imported but never called, so keys kept in
# .env were never picked up. By default it does not override variables that
# are already set in the environment.
load_dotenv()

# Accessing the OPENAI_API_KEY variable (None when it is not set)
openaiapikey = os.environ.get('OPENAI_API_KEY')

# Accessing the DEEPINFRA_API_TOKEN variable (None when it is not set)
deeptoken = os.environ.get('DEEPINFRA_API_TOKEN')

In [None]:
# Generation settings for the LLM that backs the LLMChainExtractor compressor
compressor_model_kwargs = {
    "temperature": 0.1,
    "repetition_penalty": 1.2,
    "max_new_tokens": 500,
    "top_p": 0.90,
}

# DeepInfra-hosted Mistral instruct model; settings are attached after construction
compressor_llm = DeepInfra(model_id="mistralai/Mistral-7B-Instruct-v0.1")
compressor_llm.model_kwargs = compressor_model_kwargs

In [None]:
# Embedding model used for the FAISS index and for the EmbeddingsFilter compressor
EMBEDDING_MODEL_NAME = 'text-embedding-ada-002'
embeddings = OpenAIEmbeddings(model=EMBEDDING_MODEL_NAME)
# Display helper: dump a list of documents to stdout in a readable layout
def pretty_print_docs(docs):
    """Print every document's text under a numbered heading.

    Each entry is rendered as ``Document <n>:`` (numbered from 1), a blank
    line, and then the document's ``page_content``. Consecutive entries are
    separated by a rule of 100 dashes on a line of its own.

    Args:
        docs: iterable of objects exposing a string ``page_content`` attribute.
    """
    divider = "\n" + "-" * 100 + "\n"
    sections = []
    for number, doc in enumerate(docs, start=1):
        sections.append("Document " + str(number) + ":\n\n" + doc.page_content)
    print(divider.join(sections))

In [None]:
# Chunking policy: 1500-character chunks that overlap by 300 characters,
# measured with len() and using literal (non-regex) separators.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1500,
    chunk_overlap=300,
    length_function=len,
    is_separator_regex=False,
)

# Load the PDFs from the "Documents" directory
loader = PyPDFDirectoryLoader("Documents")
docs = loader.load()

# Split the loaded documents into overlapping chunks for indexing and retrieval
documents = text_splitter.split_documents(docs)


In [None]:
%%time
# Define the path for the FAISS vector store (single definition used by both branches)
FAISS_DB_PATH = 'vectorStore/cse/semester8'

if os.path.exists(FAISS_DB_PATH):
    # Reuse the index saved by a previous run instead of embedding everything again.
    # NOTE(review): the saved index is not checked against the current `documents`;
    # delete the directory to force a rebuild after the PDFs change.
    # NOTE(review): newer LangChain releases may require
    # allow_dangerous_deserialization=True here — confirm against the installed version.
    db = FAISS.load_local(FAISS_DB_PATH, embeddings)
    print(f"vectorDB already present in dir : {FAISS_DB_PATH}")
else:
    # No saved index yet: embed the chunks, build the index and persist it
    db = FAISS.from_documents(documents, embeddings)
    db.save_local(FAISS_DB_PATH)
    print(f"vectorDB created in dir : {FAISS_DB_PATH}")


In [None]:
# k defines the number of top relevant documents each retriever returns.
# (Previously this note was a bare string statement, and the value 4 was repeated.)
RETRIEVER_TOP_K = 4

# Retriever backed by the FAISS vector store
faiss_retriever = db.as_retriever(search_kwargs={"k": RETRIEVER_TOP_K})

# BM25 retriever built over the same document chunks
bm25_retriever = BM25Retriever.from_documents(documents)
bm25_retriever.k = RETRIEVER_TOP_K

# Initialize the ensemble retriever, weighting both retrievers equally
ensemble_retriever = EnsembleRetriever(
    retrievers=[bm25_retriever, faiss_retriever], weights=[0.5, 0.5]
)

In [None]:
# Question that is sent through the retrieval pipeline.
# TODO: replace this hard-coded query with an input field in the Streamlit
# application so the user can type the query.
user_query =  '''
 List out the key elements or components involved in parallel computing, and how do these elements work together to execute tasks simultaneously and improve the overall performance and efficiency of computing systems?
'''

# Fetch the documents the ensemble retriever considers relevant to the query.
# NOTE(review): this rebinds `docs`, which the loader cell used for the loaded PDFs.
docs = ensemble_retriever.get_relevant_documents(user_query)


In [None]:

# TODO: this must be made into a mode button in the Streamlit sidebar
# (retrieval processing -> quality or standard).
# The answer is normalized so that inputs such as "High" or " high " are not
# silently treated as average mode.
quality_mode = input("enter quality mode : high or average").strip().lower()

if quality_mode == 'high':
    # Quality mode: compress the retrieved documents with the compressor LLM
    compressor = LLMChainExtractor.from_llm(compressor_llm)
else:
    # Any other answer: filter retrieved documents by embedding similarity (threshold 0.80)
    compressor = EmbeddingsFilter(embeddings=embeddings, similarity_threshold=0.80)
    print("using similarity_threshold")

# Wrap the ensemble retriever so its results pass through the selected compressor
compression_retriever = ContextualCompressionRetriever(base_compressor=compressor, base_retriever=ensemble_retriever)

In [None]:
# Run the query through the compression retriever (ensemble retrieval followed by the selected compressor)
compressed_docs = compression_retriever.get_relevant_documents(user_query)
# Print the resulting documents for inspection
pretty_print_docs(compressed_docs)

In [None]:
# Collect a printable summary of where each compressed document came from.
# The previous version sliced the metadata repr at a fixed offset (23 characters),
# which only worked for one specific directory prefix and path separator. Here the
# "source" path is reduced to its file name explicitly, accepting "/" and "\"
# as separators, and the rest of the metadata is kept as-is.
metadata_info = []
for doc in compressed_docs:
    doc_metadata = dict(doc.metadata)  # copy so the document's own metadata is untouched
    if "source" in doc_metadata:
        normalized_source = str(doc_metadata["source"]).replace("\\", "/")
        doc_metadata["source"] = normalized_source.rsplit("/", 1)[-1]
    metadata_info.append(str(doc_metadata))

In [None]:
# Display the collected metadata strings as the cell output
metadata_info

In [None]:
# Generation settings for the LLM that produces the final response
answer_model_kwargs = {
    "temperature": 0.0,
    "repetition_penalty": 1.2,
    "max_new_tokens": 512,
    "top_p": 0.9,
}

# DeepInfra-hosted Llama 2 chat model; settings are attached after construction
llm = DeepInfra(model_id="meta-llama/Llama-2-7b-chat-hf")
llm.model_kwargs = answer_model_kwargs


In [None]:
# Flatten the compressed documents (text plus metadata) into one context string
context = "\n".join([f"{doc.page_content}\nMetadata: {doc.metadata}" for doc in compressed_docs])

# The model only reads the prompt text. Passing `context` as a separate keyword
# argument does not add it to the prompt, so the retrieved passages were never
# shown to the model. Embed the context in the prompt itself instead.
rag_prompt = (
    "Answer the question using only the context below. "
    "If the context does not contain the answer, say that you do not know.\n\n"
    f"Context:\n{context}\n\n"
    f"Question: {user_query.strip()}\n\n"
    "Answer:"
)
response = llm(rag_prompt)

In [None]:
# Display the LLM's response as the cell output
response