In [None]:
#Importing necessary python modules
import os 
from langchain import PromptTemplate, LLMChain
from langchain.llms import LlamaCpp
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from langchain.callbacks.manager import CallbackManager
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import PyPDFLoader
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores.faiss import FAISS

In [None]:
# Path to the local Llama 2 chat model file (GGUF) that LlamaCpp will load
model_path = 'models/llama-2-7b-chat.Q4_K_M.gguf'

# Callback manager wired with the streaming-stdout handler
callback_manager = CallbackManager(
    [StreamingStdOutCallbackHandler()]
)

# Hugging Face sentence-transformer model used to generate the embeddings
embeddings = HuggingFaceEmbeddings(
    model_name='sentence-transformers/all-MiniLM-L6-v2',
)

# The LLM object; every setting is spelled out on its own line so it is easy to tune
llm = LlamaCpp(
    model_path=model_path,
    callback_manager=callback_manager,
    verbose=True,
    n_ctx=2048,
    n_threads=6,
    n_batch=2048,
)

In [None]:
def text_splitter(sources, chunk_size=256, chunk_overlap=32):
    """Split loaded documents into overlapping chunks ready for embedding.

    Args:
        sources: Documents to split; passed unchanged to
            ``RecursiveCharacterTextSplitter.split_documents``.
        chunk_size: ``chunk_size`` handed to the splitter. Defaults to 256,
            the value that used to be hard-coded here.
        chunk_overlap: ``chunk_overlap`` handed to the splitter. Defaults to
            32, the value that used to be hard-coded here.

    Returns:
        A new list holding every chunk produced by the splitter, in order.
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    # list() preserves the "always return a fresh list" contract without
    # the manual append loop.
    return list(splitter.split_documents(sources))

In [None]:
def create_faiss_index(chunks):
    """Build a FAISS vector index from document chunks.

    The ``page_content`` of every chunk is embedded with the notebook-level
    ``embeddings`` object and stored alongside that chunk's ``metadata``.

    Args:
        chunks: Iterable of document-like objects exposing ``page_content``
            and ``metadata`` attributes.

    Returns:
        The index object returned by ``FAISS.from_texts``.

    Raises:
        ValueError: If ``chunks`` is empty, since there is nothing to embed
            and no index can be built.
    """
    # Materialise once: the chunks are walked twice below, which would
    # silently drop the metadata if a one-shot iterator were passed in.
    chunks = list(chunks)
    if not chunks:
        raise ValueError(
            "Cannot build a FAISS index from an empty list of chunks; "
            "the source document probably contains no extractable text."
        )

    texts = [doc.page_content for doc in chunks]
    meta_data = [doc.metadata for doc in chunks]

    return FAISS.from_texts(texts, embeddings, metadatas=meta_data)

In [None]:
local_data_path = "local-data"
# Only PDF files are picked up here, although LangChain ships loaders for many other file types.
# - sorted(): os.listdir returns names in arbitrary order, so sorting keeps the
#   index build reproducible from run to run.
# - lower(): match the extension case-insensitively so files such as "REPORT.PDF"
#   are not silently ignored.
file_list = sorted(
    filename
    for filename in os.listdir(local_data_path)
    if filename.lower().endswith('.pdf')
)
len_docs = len(file_list)

In [None]:
#Creating the Index for the first document in File List
# Fail early with an actionable message instead of a bare IndexError on file_list[0].
if not file_list:
    raise FileNotFoundError(
        f"No PDF files found in {local_data_path!r}; "
        "add at least one PDF before building the index."
    )

loader = PyPDFLoader(os.path.join(local_data_path, file_list[0]))
docs = loader.load()
chunks = text_splitter(docs)
vectordb0 = create_faiss_index(chunks)

In [None]:
#Creating the index for all other documents in the list and merging it with the first vector index
# NOTE: merge_from adds to vectordb0 in place, so re-running this cell without
# first re-running the cell that creates vectordb0 merges the same documents again.
for pdf_name in file_list[1:]:
    loader = PyPDFLoader(os.path.join(local_data_path, pdf_name))
    docs = loader.load()
    chunks = text_splitter(docs)

    # A PDF that yields no chunks (e.g. no extractable text) cannot be indexed;
    # skip it rather than letting one file abort the whole build.
    if not chunks:
        print(f"Skipping {pdf_name}: no text chunks were produced.")
        continue

    vectordb_i = create_faiss_index(chunks)
    vectordb0.merge_from(vectordb_i)

# Persist the merged index to disk so it can be reloaded without re-embedding
vectordb0.save_local("Faiss_Vector_DB")

In [None]:
# Reload the vector database that was saved to disk earlier in the notebook
index = FAISS.load_local("Faiss_Vector_DB", embeddings)

# Build a retriever on top of the index; a retriever fetches the context
# most relevant to a given user query
retriever = index.as_retriever()

# Sample prompt template with {context} and {question} placeholders,
# written line by line (the resulting string is unchanged)
template = (
    " You are a chatbot answering questions.\n"
    "Please use the following context to answer your questions.\n"
    "Context: {context}\n"
    "---\n"
    "Question: {question}\n"
    "Answer: "
)

In [None]:
# Question to ask; replace the placeholder text before running the next cell,
# which uses it both to query the retriever and as the input to the LLM chain
question = "Insert Your Question Here"

In [None]:

# Ask the retriever for the documents relevant to the question
docs = retriever.get_relevant_documents(question)
# Join the retrieved page contents with newlines to form the context for the LLM chain
context = "\n".join(doc.page_content for doc in docs)
# Build the prompt template, then pre-fill its {context} slot so only {question} remains open
prompt = PromptTemplate(
    template=template,
    input_variables=["context", "question"],
).partial(context=context)
# Combine the prompt template and the LLM into a chain
llm_chain = LLMChain(llm=llm, prompt=prompt)
# Run the chain on the question; the returned value is the cell's output
llm_chain.run(question)
