In [2]:
import os
from dotenv import load_dotenv

from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_community.document_loaders import PyPDFLoader,DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain_core.prompts import PromptTemplate

from langchain_core.runnables import RunnableParallel, RunnablePassthrough, RunnableLambda
from langchain_core.output_parsers import StrOutputParser


In [5]:
# Load variables from a local .env file into the process environment
# (GOOGLE_API_KEY is read from there later; the path overrides below are optional).
load_dotenv()


# Folder containing the source PDFs. The default is the original machine-specific
# location; set RAG_DATA_PATH to run the notebook elsewhere without editing it.
DATA_PATH = os.getenv("RAG_DATA_PATH", "/Users/somesh/Desktop/my_gen_ai_folder/Project1/Data")
# Directory the FAISS index is saved to and reloaded from (override with RAG_DB_PATH).
DB_PATH = os.getenv("RAG_DB_PATH", "vectorstore/db_faiss")

In [None]:


#STEP1 : Load the PDF files from the directory
def load_pdf(directory_path):
    """Load the PDF files found in ``directory_path``.

    Args:
        directory_path: Folder handed to ``DirectoryLoader``; files are selected
            with the glob pattern ``*.pdf`` and read through ``PyPDFLoader``.

    Returns:
        The result of ``DirectoryLoader.load()`` for the matched files.
    """
    pdf_loader = DirectoryLoader(path=directory_path, glob="*.pdf", loader_cls=PyPDFLoader)
    return pdf_loader.load()

# Load everything under DATA_PATH once; `documents` is the input to the chunking step.
documents = load_pdf(DATA_PATH)
print(f"Total number of documents loaded: {len(documents)}") # 759 in the recorded run


#STEP2 : CREATE CHUNKS
def create_chunks(documents, chunk_size=500, chunk_overlap=50):
    """Split ``documents`` into smaller, overlapping chunks.

    Args:
        documents: Loaded documents to split.
        chunk_size: Forwarded to ``RecursiveCharacterTextSplitter`` (default 500).
        chunk_overlap: Forwarded to ``RecursiveCharacterTextSplitter`` (default 50).

    Returns:
        The chunks produced by the splitter's ``split_documents``.
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return splitter.split_documents(documents)

# Chunk the loaded documents with the default size/overlap (500 / 50).
chunks = create_chunks(documents)
print(f"Total number of chunks created: {len(chunks)}")  # 7080 in the recorded run

#STEP3 : CREATE EMBEDDING Model
def get_embedding_model():
    """Build the embedding model used to vectorise text.

    Returns:
        A ``HuggingFaceEmbeddings`` instance configured with the
        ``sentence-transformers/all-MiniLM-L6-v2`` model.
    """
    model_id = 'sentence-transformers/all-MiniLM-L6-v2'
    return HuggingFaceEmbeddings(model_name=model_id)

# Embedding model used to vectorise the chunks when the FAISS index is built.
embedding_model = get_embedding_model()

#STEP4 : EMBED CHUNKS AND STORE THEM IN VECTOR STORE
def create_vector_store(chunks,embedding_model,DB_PATH):
    """Embed ``chunks`` into a FAISS index and save it under ``DB_PATH``.

    Args:
        chunks: Document chunks to index; must not be empty.
        embedding_model: Embeddings object FAISS uses to vectorise each chunk.
        DB_PATH: Directory passed to ``save_local`` for the persisted index.

    Raises:
        ValueError: If ``chunks`` is empty.
    """
    # Fail fast with a readable message: with nothing to index (e.g. no PDFs
    # matched), the failure would otherwise surface from inside the FAISS
    # wrapper with no hint about the actual cause.
    if not chunks:
        raise ValueError(
            "create_vector_store received no chunks; check that the data directory "
            "contains PDFs and that chunking produced output."
        )
    db = FAISS.from_documents(chunks, embedding_model)
    db.save_local(DB_PATH)
    
# Build the FAISS index from all chunks and save it under DB_PATH.
# Every run of this cell re-embeds all chunks and rewrites the saved index.
create_vector_store(chunks, embedding_model, DB_PATH)


Total number of documents loaded: 759
Total number of chunks created: 7080


  from .autonotebook import tqdm as notebook_tqdm


In [None]:

#STEP3 : CREATE EMBEDDING Model
def get_embedding_model():
    """Build the embedding model used to vectorise text.

    NOTE(review): this repeats the definition from the index-building cell,
    presumably so this cell can run without it — keep the two in sync.

    Returns:
        A ``HuggingFaceEmbeddings`` instance configured with the
        ``sentence-transformers/all-MiniLM-L6-v2`` model.
    """
    model_id = 'sentence-transformers/all-MiniLM-L6-v2'
    return HuggingFaceEmbeddings(model_name=model_id)

# NOTE(review): load_vector_store() builds its own embedding model, so this
# instance looks unused within this cell — confirm before removing.
embedding_model = get_embedding_model()

#STEP5 : LOAD LLM ,retriever and DATA BASE
def load_llm():
    """Create the Gemini chat model used to generate answers.

    The API key is read from the ``GOOGLE_API_KEY`` environment variable each
    time this function is called.

    Returns:
        A ``ChatGoogleGenerativeAI`` instance for ``gemini-1.5-flash`` with the
        sampling settings listed below.
    """
    generation_settings = {
        "model": "gemini-1.5-flash",
        "temperature": 0.7,
        # NOTE(review): 50 output tokens is a tight cap; longer answers will
        # presumably be cut short — confirm this is intended.
        "max_output_tokens": 50,
        "top_p": 0.95,
        "top_k": 40,
        "api_key": os.getenv("GOOGLE_API_KEY"),
    }
    return ChatGoogleGenerativeAI(**generation_settings)

def load_vector_store(DB_PATH):
    """Reload the FAISS index previously saved under ``DB_PATH``.

    A fresh embedding model is created via ``get_embedding_model()`` and handed
    to FAISS together with the path.

    Args:
        DB_PATH: Directory containing the saved index.

    Returns:
        The object returned by ``FAISS.load_local``.
    """
    # SECURITY: allow_dangerous_deserialization=True permits pickle-based loading,
    # which can execute arbitrary code. Only point this at an index you created.
    return FAISS.load_local(
        DB_PATH,
        get_embedding_model(),
        allow_dangerous_deserialization=True,
    )

# Instantiate the chat model and reload the FAISS index saved under DB_PATH.
llm = load_llm()
db = load_vector_store(DB_PATH)
# Similarity-search retriever over the index, configured with k=3.
retriever = db.as_retriever(search_type="similarity", search_kwargs={"k": 3})

  from .autonotebook import tqdm as notebook_tqdm


In [7]:

#STEP6: CREATE PROMPT TEMPLATE
# Prompt text for the LLM; the {context} and {question} placeholders are
# substituted when the prompt is rendered.
template = """
Use the pieces of information provided in the context to answer user's question.
If you dont know the answer, just say that you dont know, dont try to make up an answer. 
Dont provide anything out of the given context

Context: {context}
Question: {question}

Start the answer directly. No small talk please.
"""

def format_docs(retrieved_docs):
    """Merge retrieved documents into a single context string.

    Args:
        retrieved_docs: Iterable of objects exposing a ``page_content`` attribute.

    Returns:
        The ``page_content`` values joined by a blank line (``"\\n\\n"``);
        an empty string when there are no documents.
    """
    page_texts = [document.page_content for document in retrieved_docs]
    return "\n\n".join(page_texts)

# Fan the incoming question out into the two inputs the prompt needs:
#   'context'  -> chunks fetched by the retriever, merged into one string by format_docs
#   'question' -> the user's query, passed through unchanged
# NOTE: `retriever` and `llm` are created in the loading cell. On a fresh kernel
# that cell must run first, otherwise this cell fails with
# "NameError: name 'retriever' is not defined".
parallel_chain = RunnableParallel({
    'context': retriever | RunnableLambda(format_docs),
    'question': RunnablePassthrough()
})

# (A leftover debug call that invoked parallel_chain and discarded the result was
# removed here; it only cost an extra retrieval every time the cell ran.)

prompt = PromptTemplate(
    template=template,
    input_variables=["context", "question"]
)

# Converts the chat model's message output into a plain string.
parser = StrOutputParser()

# Full pipeline: question -> {context, question} -> rendered prompt -> LLM -> str
main_chain = parallel_chain | prompt | llm | parser


def chatbot():
    """Ask the user for one question, run it through ``main_chain`` and print the answer.

    Blank input (empty or whitespace only) is rejected with a short message
    instead of being sent to the retriever and the LLM. A non-blank query is
    forwarded exactly as typed.

    Returns:
        None. The answer is printed rather than returned.
    """
    query = input("Ask a question: ")
    # Guard: a blank question cannot retrieve meaningful context and would
    # only spend an API call.
    if not query.strip():
        print("Please enter a question.")
        return
    result = main_chain.invoke(query)
    print(result)
    
# Run a single interactive question/answer round.
chatbot()

NameError: name 'retriever' is not defined