In [None]:
# Show the current working directory before it is changed in the next cell
%pwd

In [2]:
# Change the dir to main to prevent path issue
import os

# Guard the move so the cell is idempotent: an unconditional chdir("../")
# climbs one directory higher on every re-run. The "Data" folder read by the
# loading cell is used as the marker that the working directory is already
# the project root.
if not os.path.isdir("Data"):
    os.chdir("../")

In [None]:
# Root Project Directory: confirm the working directory after the chdir above
%pwd

In [4]:
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [5]:
# Extract Data From the PDF File
def load_pdf_file(data):
    """Load the PDF files found in a directory.

    Args:
        data: Path of the directory to scan. Only files matching the
            "*.pdf" glob are picked up, each read with PyPDFLoader.

    Returns:
        The result of DirectoryLoader.load() for the matched files.
    """
    pdf_loader = DirectoryLoader(data, glob="*.pdf", loader_cls=PyPDFLoader)
    return pdf_loader.load()

In [None]:
# Load pdf from the data folder
extracted_data=load_pdf_file(data='Data/')
# Preview only the first few entries: echoing the whole list would dump every
# loaded page into the notebook output.
extracted_data[:3]

In [None]:
# Report how many documents (pages) were loaded
page_count = len(extracted_data)
print("Length of Book Pages", page_count)

In [8]:
# Chunking Operation
def text_split(extracted_data, chunk_size=500, chunk_overlap=20):
    """Split loaded documents into smaller overlapping chunks.

    Args:
        extracted_data: Documents to split, as returned by load_pdf_file().
        chunk_size: Maximum size of each chunk, passed straight to
            RecursiveCharacterTextSplitter. Defaults to 500, the value that
            was previously hard-coded.
        chunk_overlap: Overlap between consecutive chunks, passed straight to
            RecursiveCharacterTextSplitter. Defaults to 20, the value that
            was previously hard-coded.

    Returns:
        The chunks produced by RecursiveCharacterTextSplitter.split_documents().
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return text_splitter.split_documents(extracted_data)

In [None]:
# Split the loaded pages into chunks and report how many were produced
text_chunks = text_split(extracted_data)
chunk_count = len(text_chunks)
print("Length of Text Chunks", chunk_count)

In [10]:
# Load The Google Api key
import warnings

from dotenv import load_dotenv

# Read the .env file *before* looking the key up: in the original cell order
# load_dotenv() only ran in a later cell, so a key stored in .env was still
# missing from os.environ here and the lookup returned None.
load_dotenv()
GOOGLE_API_KEY = os.environ.get('GOOGLE_API_KEY')

# Warn rather than raise, so a notebook that gets its credentials some other
# way keeps running.
if not GOOGLE_API_KEY:
    warnings.warn("GOOGLE_API_KEY is not set in the environment or in a .env file")

In [None]:
# Embeddings
from langchain_google_genai import GoogleGenerativeAIEmbeddings # Embedding Model

# Load The Google Api key
from dotenv import load_dotenv
load_dotenv()  # load variables from a .env file into the environment

# Embeddings
embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")

# embed_documents() takes a list of strings. The original call passed the
# literal string "text_chunks", so the real chunks were never embedded.
# Pass the text of every chunk instead.
# NOTE(review): the FAISS cell further down embeds the same chunks again via
# FAISS.from_documents(); slice text_chunks here if this is only a smoke test.
vector=embeddings.embed_documents([chunk.page_content for chunk in text_chunks])

In [None]:
# Determine the dimension: embed a short probe string and measure the vector
query_result = embeddings.embed_query("Hello world")
embedding_dim = len(query_result)
print("Length", embedding_dim)

In [13]:
# Import necessary libraries
from langchain.vectorstores import FAISS
from langchain_google_genai import GoogleGenerativeAIEmbeddings

# Define the path to the FAISS database (relative to the working directory)
DB_FAISS_PATH = "VectorStore/db_faiss"

# Initialize the embeddings model (rebinds `embeddings` with the same model name
# used in the earlier embeddings cell)
embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")

# text_chunks is the output of text_split() from the chunking cell
# Create a FAISS index from the text chunks and embeddings
db = FAISS.from_documents(text_chunks, embeddings)

# Save the FAISS index to a local file
db.save_local(DB_FAISS_PATH)

# Load the FAISS index from the local file
# NOTE(review): allow_dangerous_deserialization=True opts in to deserializing the
# stored index; presumably safe here because the file was written just above, but
# do not point this at index files from an untrusted source — confirm against the
# LangChain FAISS documentation.
loaded_db = FAISS.load_local(DB_FAISS_PATH, embeddings, allow_dangerous_deserialization=True)

In [None]:
# Display the reloaded vector store object to confirm the load succeeded
loaded_db

In [15]:
# Similarity-search retriever over the reloaded index; k=3 limits each query
# to three results
retriever = loaded_db.as_retriever(
    search_type="similarity",
    search_kwargs={"k": 3},
)

In [16]:
# Run a sample question through the retriever on its own (no LLM involved yet)
sample_query = "What is Water Mammals?"
retrieved_docs = retriever.invoke(sample_query)

In [None]:
# Inspect what the retriever returned for the sample question
retrieved_docs

In [18]:
from langchain_google_genai import ChatGoogleGenerativeAI

# Gemini chat model used to generate the answers
llm = ChatGoogleGenerativeAI(
    model="gemini-1.5-pro-002",
    temperature=0.3,
    max_tokens=500,
)

In [19]:
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate

# System instructions for the answering model. The adjacent literals are
# concatenated into one string; "{context}" is left as a template placeholder.
system_prompt = (
    "You are an assistant for question-answering tasks. "
    "Use the following pieces of retrieved context to answer the question. "
    "If you don't know the answer, say that you don't know. "
    "Use three sentences maximum and keep the answer in bullet format"
    "\n\n{context}"
)

# Chat prompt with two messages: the system instructions, then the user's
# question supplied through the "{input}" placeholder
prompt_messages = [
    ("system", system_prompt),
    ("human", "{input}"),
]
prompt = ChatPromptTemplate.from_messages(prompt_messages)

In [20]:
# Chain that passes the retrieved documents and the prompt to the LLM
question_answer_chain = create_stuff_documents_chain(llm=llm, prompt=prompt)

# Full RAG pipeline: retrieve documents, then hand them to the chain above
rag_chain = create_retrieval_chain(
    retriever=retriever,
    combine_docs_chain=question_answer_chain,
)

In [None]:
# Ask the sample question through the full RAG chain and show only the answer text
question = "What is Water Mammals?"
response = rag_chain.invoke({"input": question})
answer = response["answer"]
print(answer)