# Ingesting PDFs

In [1]:
import os

from langchain_community.document_loaders import UnstructuredPDFLoader

In [2]:
# Local PDF to ingest. The default location can be overridden by setting the
# RAG_PDF_PATH environment variable instead of editing this cell.
path = os.environ.get(
    "RAG_PDF_PATH",
    "C:/Users/omthe/brain/Desktop/RAG_based_Chatbot/empsit.pdf",
)

# Fail fast with a clear message. Previously a falsy path only printed a hint
# and left `data` undefined, which surfaced in the next cell as a NameError.
if not path or not os.path.isfile(path):
    raise FileNotFoundError(
        f"PDF file not found: {path!r}. Upload a PDF file or set RAG_PDF_PATH."
    )

# Parse the PDF into a list of documents used by every later cell.
loader = UnstructuredPDFLoader(file_path=path)
data = loader.load()

In [3]:
# Preview the beginning of the first loaded document. Only a slice is shown so
# the entire extracted text is not dumped into the notebook output.
# NOTE(review): depending on the loader mode, data[0] may hold the whole
# document rather than a single page -- confirm.
PREVIEW_CHARS = 1000
data[0].page_content[:PREVIEW_CHARS]

"International Journal of Scientific Reports Ogunlayi AC et al. Int J Sci Rep. 2024 Jun;10(6):188-194 http://www.sci-rep.com\n\npISSN 2454-2156 | eISSN 2454-2164\n\nOriginal Research Article\n\nDOI: https://dx.doi.org/10.18203/issn.2454-2156.IntJSciRep20241315\n\nAssessing the diagnostic impact of P63, PSA and BCL-2 proteins in premalignant and malignant prostate tissues\n\nAderonke C. Ogunlayi1, Victor O. Ekundina1, Adedapo O. Kehinde2*,\n\nLinus A. Enye3, Adegoke O. Aremu1\n\n1Department of Medical Laboratory Science, College of Medicine and Health Sciences, Afe Babalola University, Ado- Ekiti, Ekiti State, Nigeria 2Department of Medical Laboratory Science, College of Basic Medical Sciences, Achievers University, Idasen-owo, Ondo State, Nigeria 3Department of Anatomy Science, College of Medicine and Health Sciences, Afe Babalola University, Ado-Ekiti, Ekiti State, Nigeria\n\nReceived: 05 March 2024 Revised: 12 April 2024 Accepted: 16 April 2024\n\nCorrespondence: Dr. Adedapo O. Kehin

# Vector Embeddings

In [4]:
from langchain_community.embeddings import HuggingFaceHubEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma

In [5]:
# Chunking parameters, named so they are easy to find and tune.
# NOTE(review): sizes are presumably measured in characters (the splitter's
# default length function) -- confirm.
CHUNK_SIZE = 7000
CHUNK_OVERLAP = 100

# Break the loaded documents into overlapping chunks ready for embedding.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
)
chunks = text_splitter.split_documents(data)

In [7]:
# Read the Hugging Face token from the environment. HF_API was not defined by
# any cell in this notebook, so a fresh kernel failed here with a NameError;
# reading it from the environment also keeps the secret out of the file.
HF_API = os.environ.get("HUGGINGFACEHUB_API_TOKEN")
if not HF_API:
    raise RuntimeError(
        "Set the HUGGINGFACEHUB_API_TOKEN environment variable before running this cell."
    )

# Embed every chunk and store the vectors in a Chroma collection named "RAG".
embeddings = HuggingFaceHubEmbeddings(huggingfacehub_api_token=HF_API)
vector_db = Chroma.from_documents(
    documents=chunks,
    embedding=embeddings,
    collection_name="RAG",
)

# Retrieval

In [8]:
from langchain.prompts import ChatPromptTemplate, PromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_google_genai import GoogleGenerativeAI
from langchain_core. runnables import RunnablePassthrough
from langchain. retrievers.multi_query import MultiQueryRetriever

In [9]:
# LLM from Google
# Read the API key from the environment. GOOGLE_API was not defined by any
# cell in this notebook, so a fresh kernel failed here with a NameError;
# reading it from the environment also keeps the secret out of the file.
GOOGLE_API = os.environ.get("GOOGLE_API_KEY")
if not GOOGLE_API:
    raise RuntimeError(
        "Set the GOOGLE_API_KEY environment variable before running this cell."
    )

# NOTE(review): "models/text-bison-001" looks like a legacy model id -- confirm
# it is still served before relying on this cell.
llm = GoogleGenerativeAI(model="models/text-bison-001", google_api_key=GOOGLE_API)

In [10]:
# Instruction text asking the LLM to rephrase the user's question five ways;
# the single placeholder {question} is filled in at query time.
_QUERY_TEMPLATE = """You are an AI language model assistant. Your task is to generate five
different versions of the given user question to retrieve relevant documents from
a vector database. By generating multiple perspectives on the user question, your
goal is to help the user overcome some of the limitations of the distance-based
similarity search. Provide these alternative questions separated by newlines.
Original question: {question}"""

# Prompt handed to the multi-query retriever.
QUERY_PROMPT = PromptTemplate(
    template=_QUERY_TEMPLATE,
    input_variables=["question"],
)

In [11]:
# RAG Prompt: the answer must be grounded in the retrieved context only.
template = """Answer the following question based ONLY on the following context:
{context}
Question: {question}
"""
prompt = ChatPromptTemplate.from_template(template)

# Retriever that asks the LLM for alternative phrasings of the question
# (using QUERY_PROMPT) and searches the vector store with them.
base_retriever = vector_db.as_retriever()
retriever = MultiQueryRetriever.from_llm(
    retriever=base_retriever,
    llm=llm,
    prompt=QUERY_PROMPT,
)


In [12]:
def _format_docs(docs):
    """Join the text of the retrieved documents into one context string.

    Args:
        docs: iterable of retrieved documents, each exposing `page_content`.

    Returns:
        The documents' text separated by blank lines.
    """
    return "\n\n".join(doc.page_content for doc in docs)


# RAG chain: retrieve context for the question, fill the prompt, call the LLM
# and return the answer as a plain string.
# The retriever output is passed through _format_docs so the prompt receives
# the documents' text rather than the repr of a list of Document objects
# (which would also inject metadata into the context).
chain = (
    {"context": retriever | _format_docs, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

# Test

In [13]:
# Delete the collection held by vector_db; uncomment to reset the store,
# e.g. before re-ingesting the PDF.
# vector_db.delete_collection()

In [14]:
# Smoke-test the whole chain with a sample request.
question = "summary of the document like I'm five year old"
chain.invoke(question)

Retrying langchain_google_genai.llms._completion_with_retry.<locals>._completion_with_retry in 4.0 seconds as it raised RetryError: Timeout of 60.0s exceeded, last exception: 503 failed to connect to all addresses; last error: UNKNOWN: ipv4:142.250.70.42:443: tcp handshaker shutdown.
Retrying langchain_google_genai.llms._completion_with_retry.<locals>._completion_with_retry in 4.0 seconds as it raised RetryError: Timeout of 60.0s exceeded, last exception: 503 failed to connect to all addresses; last error: UNKNOWN: ipv4:142.250.199.170:443: tcp handshaker shutdown.
Retrying langchain_google_genai.llms._completion_with_retry.<locals>._completion_with_retry in 4.0 seconds as it raised RetryError: Timeout of 60.0s exceeded, last exception: 503 failed to connect to all addresses; last error: UNKNOWN: ipv4:142.250.70.42:443: socket is null.
Retrying langchain_google_genai.llms._completion_with_retry.<locals>._completion_with_retry in 8.0 seconds as it raised RetryError: Timeout of 60.0s exc

RetryError: Timeout of 60.0s exceeded, last exception: 503 failed to connect to all addresses; last error: UNKNOWN: ipv4:142.251.42.106:443: tcp handshaker shutdown