# Ingesting PDFs

In [None]:
!pip install unstructured langchain
!pip install "unstructured[all-docs]"

In [2]:
from langchain_community.document_loaders import UnstructuredPDFLoader

In [3]:
# local pdf file upload
path = "empsit.pdf"

if path:
    loader = UnstructuredPDFLoader(file_path=path)
    data = loader.load()
else:
    print("Upload a PDF file")

In [4]:
# preview first page
data[0].page_content

"International Journal of Scientific Reports Ogunlayi AC et al. Int J Sci Rep. 2024 Jun;10(6):188-194 http://www.sci-rep.com\n\npISSN 2454-2156 | eISSN 2454-2164\n\nOriginal Research Article\n\nDOI: https://dx.doi.org/10.18203/issn.2454-2156.IntJSciRep20241315\n\nAssessing the diagnostic impact of P63, PSA and BCL-2 proteins in premalignant and malignant prostate tissues\n\nAderonke C. Ogunlayi1, Victor O. Ekundina1, Adedapo O. Kehinde2*,\n\nLinus A. Enye3, Adegoke O. Aremu1\n\n1Department of Medical Laboratory Science, College of Medicine and Health Sciences, Afe Babalola University, Ado- Ekiti, Ekiti State, Nigeria 2Department of Medical Laboratory Science, College of Basic Medical Sciences, Achievers University, Idasen-owo, Ondo State, Nigeria 3Department of Anatomy Science, College of Medicine and Health Sciences, Afe Babalola University, Ado-Ekiti, Ekiti State, Nigeria\n\nReceived: 05 March 2024 Revised: 12 April 2024 Accepted: 16 April 2024\n\nCorrespondence: Dr. Adedapo O. Kehin

# Vector Embeddings

In [5]:
!ollama pull nomic-embed-text 

[?25lpulling manifest â ™ [?25h[?25l[2K[1Gpulling manifest â ¹ [?25h[?25l[2K[1Gpulling manifest â ¸ [?25h[?25l[2K[1Gpulling manifest â ¼ [?25h[?25l[2K[1Gpulling manifest â ¼ [?25h[?25l[2K[1Gpulling manifest â ¦ [?25h[?25l[2K[1Gpulling manifest â § [?25h[?25l[2K[1Gpulling manifest â ‡ [?25h[?25l[2K[1Gpulling manifest â � [?25h[?25l[2K[1Gpulling manifest â ‹ [?25h[?25l[2K[1Gpulling manifest â ™ [?25h[?25l[2K[1Gpulling manifest â ¹ [?25h[?25l[2K[1Gpulling manifest â ¸ [?25h[?25l[2K[1Gpulling manifest â ¼ [?25h[?25l[2K[1Gpulling manifest â ´ [?25h[?25l[2K[1Gpulling manifest â ¦ [?25h[?25l[2K[1Gpulling manifest â ¦ [?25h[?25l[2K[1Gpulling manifest â ‡ [?25h[?25l[2K[1Gpulling manifest â � [?25h[?25l[2K[1Gpulling manifest â ‹ [?25h[?25l[2K[1Gpulling manifest â ™ [?25h[?25l[2K[1Gpulling manifest â ¹ [?25h[?25l[2K[1Gpulling manifest â ¹ [?25h[?25l[2K[1Gpulling manifest â ¼ [?25h[?25l[2K[1Gpulling ma

In [6]:
!ollama list

NAME                   	ID          	SIZE  	MODIFIED               
nomic-embed-text:latest	0a109f422b47	274 MB	Less than a second ago	
gemma:2b               	b50d6c999e59	1.7 GB	3 hours ago           	


In [7]:
!pip install chromadb langchain-text-splitters

Defaulting to user installation because normal site-packages is not writeable


In [8]:
from langchain_community.embeddings import OllamaEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma

In [18]:
# split and chunk
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
chunks = text_splitter.split_documents(data)

In [10]:
# add vector db
vector_db = Chroma.from_documents(documents=chunks,
                                 embedding=OllamaEmbeddings(model="nomic-embed-text:latest", show_progress=True),
                                 collection_name="local-rag")

OllamaEmbeddings: 100%|██████████████████████████████████████████████████████████████████| 5/5 [00:58<00:00, 11.72s/it]


# Retrieval

In [11]:
from langchain.prompts import ChatPromptTemplate, PromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_community.chat_models import ChatOllama
from langchain_core. runnables import RunnablePassthrough
from langchain. retrievers.multi_query import MultiQueryRetriever

In [12]:
# LLM from ollama
local_model = "gemma:2b"
llm = ChatOllama(model=local_model)

In [13]:
QUERY_PROMPT = PromptTemplate(
    input_variables=["question"],
    template="""You are an AI language model assistant. Your task is to generate five
different versions of the given user question to retrieve relevant documents from
a vector database. By generating multiple perspectives on the user question, your
goal is to help the user overcome some of the limitations of the distance-based
similarity search. Provide these alternative questions separated by newlines.
Original question: {question}""",
)

In [14]:
retriever = MultiQueryRetriever.from_llm(
    vector_db.as_retriever(),
    llm,
    prompt=QUERY_PROMPT
)

# RAG Prompt
template = """Answer the following question based ONLY on the following context:
{context}
Question: {question}
"""

prompt = ChatPromptTemplate.from_template(template)


In [15]:
chain = (
    {"context": retriever, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

# Test

In [17]:
chain.invoke(input(""))

 WHat is document about?


OllamaEmbeddings: 100%|██████████████████████████████████████████████████████████████████| 1/1 [00:03<00:00,  3.89s/it]
OllamaEmbeddings: 100%|██████████████████████████████████████████████████████████████████| 1/1 [00:02<00:00,  2.07s/it]
OllamaEmbeddings: 100%|██████████████████████████████████████████████████████████████████| 1/1 [00:02<00:00,  2.13s/it]
OllamaEmbeddings: 100%|██████████████████████████████████████████████████████████████████| 1/1 [00:02<00:00,  2.05s/it]
OllamaEmbeddings: 100%|██████████████████████████████████████████████████████████████████| 1/1 [00:02<00:00,  2.17s/it]
OllamaEmbeddings: 100%|██████████████████████████████████████████████████████████████████| 1/1 [00:02<00:00,  2.06s/it]
OllamaEmbeddings: 100%|██████████████████████████████████████████████████████████████████| 1/1 [00:02<00:00,  2.12s/it]
OllamaEmbeddings: 100%|██████████████████████████████████████████████████████████████████| 1/1 [00:02<00:00,  2.06s/it]
OllamaEmbeddings: 100%|█████████████████

'This document is about an IHC analysis of prostate tissues with the aim of studying the expression of PSA, p63 and BCL-2 proteins in benign prostate hyperplasia (BPH), CaP and normal prostatic epithelium. The study aims to understand the relationship between the expression of these proteins and the development of prostate cancer.'

In [20]:
chain.invoke(input(""))

 limitaions of the research


OllamaEmbeddings: 100%|██████████████████████████████████████████████████████████████████| 1/1 [00:03<00:00,  3.79s/it]
OllamaEmbeddings: 100%|██████████████████████████████████████████████████████████████████| 1/1 [00:02<00:00,  2.07s/it]
OllamaEmbeddings: 100%|██████████████████████████████████████████████████████████████████| 1/1 [00:02<00:00,  2.15s/it]
OllamaEmbeddings: 100%|██████████████████████████████████████████████████████████████████| 1/1 [00:02<00:00,  2.06s/it]
OllamaEmbeddings: 100%|██████████████████████████████████████████████████████████████████| 1/1 [00:02<00:00,  2.10s/it]
OllamaEmbeddings: 100%|██████████████████████████████████████████████████████████████████| 1/1 [00:02<00:00,  2.07s/it]
OllamaEmbeddings: 100%|██████████████████████████████████████████████████████████████████| 1/1 [00:02<00:00,  2.17s/it]
OllamaEmbeddings: 100%|██████████████████████████████████████████████████████████████████| 1/1 [00:02<00:00,  2.06s/it]
OllamaEmbeddings: 100%|█████████████████

'Sure, here are the limitations of the research:\n\n* The study was retrospective, which means that the data was collected from existing patient files and records. This limits the generalizability of the results to a population of all prostate cancer patients.\n\n\n* The study was conducted on a relatively small number of patients (n=80). This can make it difficult to draw conclusions about the efficacy of the biomarkers in a larger population.\n\n\n* The study was conducted in a single center, which could limit the generalizability of the results to other medical centers.\n\n\n* The study did not use a control group of patients with benign prostate hyperplasia (BPH) to compare the diagnostic accuracy of the biomarkers. This makes it difficult to determine the true diagnostic value of the biomarkers.\n\n\n* The study did not use a variety of biomarkers to assess their diagnostic accuracy. This limits the generalizability of the results to other biomarkers.'

In [None]:
# Delete all Colletion in the db
# vector_db.delete_collection()