Chat with My PDF: RAG with LangChain, Ollama, and a FAISS Vector DB

In [1]:
# Install the required libraries (run once in the kernel's environment):
# pip install -U langchain-community faiss-cpu langchain-huggingface pymupdf tiktoken langchain-ollama python-dotenv

In [2]:
import os
import warnings
from dotenv import load_dotenv

# KMP_DUPLICATE_LIB_OK tells the OpenMP runtime to carry on when more than one copy of it
# is loaded into the process (presumably faiss plus another native library — confirm).
os.environ['KMP_DUPLICATE_LIB_OK'] = 'True'
# NOTE(review): this silences every warning for the session, including deprecation notices
# from the langchain packages; consider narrowing the filter.
warnings.filterwarnings('ignore')
# Read variables from a local .env file into the environment.
# NOTE(review): no visible cell reads an environment variable afterwards.
load_dotenv()

True

In [3]:
from langchain_community.document_loaders import PyMuPDFLoader

# Parse the paper with PyMuPDF; the result is a list of Documents (one per PDF page here).
pdf_path = "1. dietary supplements - for whom.pdf"
loader = PyMuPDFLoader(pdf_path)
docs = loader.load()

In [4]:
doc = docs[0]

In [5]:
doc.metadata

{'source': '1. dietary supplements - for whom.pdf',
 'file_path': '1. dietary supplements - for whom.pdf',
 'page': 0,
 'total_pages': 17,
 'format': 'PDF 1.7',
 'title': '',
 'author': '',
 'subject': '',
 'keywords': '',
 'creator': '',
 'producer': 'iLovePDF',
 'creationDate': '',
 'modDate': 'D:20241021113754Z',
 'trapped': ''}

In [6]:
print(doc.page_content)

International  Journal  of
Environmental Research
and Public Health
Review
Dietary Supplements—For Whom? The Current State of
Knowledge about the Health Effects of Selected
Supplement Use
Regina Ewa Wierzejska


Citation: Wierzejska, R.E. Dietary
Supplements—For Whom? The
Current State of Knowledge about the
Health Effects of Selected Supplement
Use. Int. J. Environ. Res. Public Health
2021, 18, 8897. https://doi.org/
10.3390/ijerph18178897
Academic Editor: Paul B. Tchounwou
Received: 15 July 2021
Accepted: 21 August 2021
Published: 24 August 2021
Publisher’s Note: MDPI stays neutral
with regard to jurisdictional claims in
published maps and institutional afﬁl-
iations.
Copyright: © 2021 by the author.
Licensee MDPI, Basel, Switzerland.
This article is an open access article
distributed
under
the
terms
and
conditions of the Creative Commons
Attribution (CC BY) license (https://
creativecommons.org/licenses/by/
4.0/).
Department of Nutrition and Nutritional Value of Food, 

In [7]:
len(docs)

17

In [8]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Cut every page into pieces of at most 1000 characters, with 100 characters shared
# between neighbouring pieces so text at a boundary is not lost.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=100,
)

# `chunks` is a flat list of Documents; each keeps the metadata of the page it came from.
chunks = text_splitter.split_documents(docs)

In [9]:
print(chunks[0].page_content)

International  Journal  of
Environmental Research
and Public Health
Review
Dietary Supplements—For Whom? The Current State of
Knowledge about the Health Effects of Selected
Supplement Use
Regina Ewa Wierzejska


Citation: Wierzejska, R.E. Dietary
Supplements—For Whom? The
Current State of Knowledge about the
Health Effects of Selected Supplement
Use. Int. J. Environ. Res. Public Health
2021, 18, 8897. https://doi.org/
10.3390/ijerph18178897
Academic Editor: Paul B. Tchounwou
Received: 15 July 2021
Accepted: 21 August 2021
Published: 24 August 2021
Publisher’s Note: MDPI stays neutral
with regard to jurisdictional claims in
published maps and institutional afﬁl-
iations.
Copyright: © 2021 by the author.
Licensee MDPI, Basel, Switzerland.
This article is an open access article
distributed
under
the
terms
and
conditions of the Creative Commons
Attribution (CC BY) license (https://
creativecommons.org/licenses/by/
4.0/).


In [10]:
print(type(chunks))

<class 'list'>


In [11]:
len(docs[0].page_content),len(chunks[0].page_content)

(4491, 946)

In [12]:
# Count tokens in the first page and in the first chunk
import tiktoken 

# Uses the tokenizer tiktoken associates with gpt-4o-mini.
# NOTE(review): the Ollama models used later presumably tokenize differently, so treat
# these numbers as a rough size estimate only.
encoding = tiktoken.encoding_for_model("gpt-4o-mini")
tuple(
    len(encoding.encode(text))
    for text in (docs[0].page_content, chunks[0].page_content)
)

(1017, 271)

In [13]:
from langchain_ollama import OllamaEmbeddings

import faiss
from langchain_community.vectorstores import FAISS
from langchain_community.docstore.in_memory import InMemoryDocstore


In [14]:
# Embedding model served by the Ollama instance at localhost:11434.
embeddings = OllamaEmbeddings(
    model="nomic-embed-text",
    base_url="http://localhost:11434",
)

# Embed a throwaway string once; the length of the result is used to size the FAISS index.
single_vector = embeddings.embed_query("this is a text")


In [15]:
len(single_vector)

768

In [16]:
# Flat (exhaustive) L2-distance index whose dimension matches the embedding vectors.
embedding_dim = len(single_vector)
index = faiss.IndexFlatL2(embedding_dim)

In [17]:
index

<faiss.swigfaiss_avx2.IndexFlatL2; proxy of <Swig Object of type 'faiss::IndexFlatL2 *' at 0x000001F52D32F870> >

In [18]:
# Assemble the store by hand from its parts: the embedding model, the (still empty) FAISS
# index, an in-memory docstore for the Document objects, and an empty mapping between
# index positions and docstore ids.
vector_store = FAISS(
    embedding_function=embeddings,
    index=index,
    docstore=InMemoryDocstore(),
    index_to_docstore_id=dict(),
)

In [19]:
vector_store

<langchain_community.vectorstores.faiss.FAISS at 0x1f52f77ff10>

In [20]:
chunks[0]

Document(metadata={'source': '1. dietary supplements - for whom.pdf', 'file_path': '1. dietary supplements - for whom.pdf', 'page': 0, 'total_pages': 17, 'format': 'PDF 1.7', 'title': '', 'author': '', 'subject': '', 'keywords': '', 'creator': '', 'producer': 'iLovePDF', 'creationDate': '', 'modDate': 'D:20241021113754Z', 'trapped': ''}, page_content='International  Journal  of\nEnvironmental Research\nand Public Health\nReview\nDietary Supplements—For Whom? The Current State of\nKnowledge about the Health Effects of Selected\nSupplement Use\nRegina Ewa Wierzejska\n\x01\x02\x03\x01\x04\x05\x06\x07\x08\n\x01\x02\x03\x04\x05\x06\x07\nCitation: Wierzejska, R.E. Dietary\nSupplements—For Whom? The\nCurrent State of Knowledge about the\nHealth Effects of Selected Supplement\nUse. Int. J. Environ. Res. Public Health\n2021, 18, 8897. https://doi.org/\n10.3390/ijerph18178897\nAcademic Editor: Paul B. Tchounwou\nReceived: 15 July 2021\nAccepted: 21 August 2021\nPublished: 24 August 2021\nPublish

In [21]:
type(chunks)

list

In [22]:
# Embed every chunk and store it; `ids` receives the generated document ids.
# NOTE(review): re-running this cell presumably adds the same chunks again under new
# ids — rebuild the index and store first if a clean state is needed.
ids = vector_store.add_documents(chunks)

In [23]:
ids

['6579d62d-4cbb-4078-97cf-8adef42c5aa2',
 '1456b323-43dd-43e9-bdf4-b49bd9c1a72c',
 'e1a5c2d3-2fcf-4b60-9bac-54f659dfb17c',
 '4677d32e-e823-4832-b032-6553f5376dbb',
 '03d8e4d6-dca4-41dc-a5c4-8b4198a03ec6',
 '0a63ab4c-aede-4861-818d-c94c6cd09465',
 'f4a569ce-3a95-4726-a0d6-8c08ade69e0c',
 '1b777148-7ab6-465d-8f1b-b76c5d3857e2',
 'f5288db1-9987-4371-be21-b51af0e13279',
 '2533ae10-43dd-4278-b405-637cb5332f0a',
 'c5829073-507e-4c39-8b58-2a043ddd5e71',
 'bb84ab5f-7eef-4c05-b16a-74336f4473a8',
 '8be1dcc5-23ee-4aa7-a487-60c2265d47bc',
 '992bd3f2-23b1-453b-a0b6-18ebe95d4612',
 '2684a285-be9a-4330-ac1f-37a4c422a9ab',
 'c80c9140-b422-42ac-b9df-5dd0ad0df421',
 '8f716614-03be-4c26-b6e2-5d8f270ba182',
 'a74d17ac-fdc9-41e8-96fb-57dd168d7b56',
 'b0d290d3-b555-49d5-8439-4882152a14aa',
 '9185bc71-695e-460c-8b01-35d33226bebf',
 '56344815-e069-4d50-9a10-4d418577bf68',
 'a7ebfe1a-8796-40e9-b3b8-ca866d03dc29',
 'd15215c6-4b12-43bd-8733-5806a52f8c12',
 '466c17a2-c2a7-4032-bb85-3afc304fce19',
 '3b06f8f8-a99a-

### Retrieval

In [26]:
question = "what are the dietary supplements?"

# Keep the search hits under their own names: reusing `docs` / `doc` here would overwrite
# the loaded PDF pages and the first page, so earlier cells that read them would give
# different results when re-run.
similar_docs = vector_store.search(query=question, search_type='similarity')

# Print each retrieved chunk, separated by blank lines for readability.
for hit in similar_docs:
    print(hit.page_content)
    print("\n\n")

supplements mean products that are concentrated sources of vitamins, minerals, or other
substances with a nutritional or physiological effect (e.g., amino acids, essential fatty acids,
probiotics, plants, and herbal extracts) intended to supplement the regular diet. Dietary
supplements are produced in the form of capsules, tablets, pills, and other similar forms,
designed to be taken in measured small unit quantities [1,2]. Dietary supplements, despite
their route of administration and drug-like appearance, have been classiﬁed as foodstuffs
and not medicines. Thus, in formal terms, supplement users are consumers rather than
patients, but the question remains whether it is sick or healthy individuals who should
be the primary users. The market for dietary supplements continues to expand at a rapid
pace, and manufacturers develop products for health problems affecting almost all organs
of the body, as well as for non-existing conditions, e.g., supplements for bladder elasticity.



Attri

In [28]:
# MMR retriever: consider the 100 nearest chunks and return 3 of them.
# NOTE(review): per the LangChain docs, lambda_mult=1 is the "minimum diversity" end of
# the scale, so this should rank like plain similarity search; lower it (default 0.5)
# to get more varied chunks.
# The misspelled name `retriver` is kept because later cells refer to it.
mmr_search_kwargs = {'k': 3, 'fetch_k': 100, 'lambda_mult': 1}
retriver = vector_store.as_retriever(search_type="mmr", search_kwargs=mmr_search_kwargs)

In [30]:
# Store the retriever output under its own name: rebinding `docs` / `doc` here would
# overwrite the loaded PDF pages and the first page, so earlier cells that read them
# would give different results when re-run.
mmr_docs = retriver.invoke(question)

# Print each retrieved chunk, separated by blank lines for readability.
for hit in mmr_docs:
    print(hit.page_content)
    print("\n\n\n")

supplements mean products that are concentrated sources of vitamins, minerals, or other
substances with a nutritional or physiological effect (e.g., amino acids, essential fatty acids,
probiotics, plants, and herbal extracts) intended to supplement the regular diet. Dietary
supplements are produced in the form of capsules, tablets, pills, and other similar forms,
designed to be taken in measured small unit quantities [1,2]. Dietary supplements, despite
their route of administration and drug-like appearance, have been classiﬁed as foodstuffs
and not medicines. Thus, in formal terms, supplement users are consumers rather than
patients, but the question remains whether it is sick or healthy individuals who should
be the primary users. The market for dietary supplements continues to expand at a rapid
pace, and manufacturers develop products for health problems affecting almost all organs
of the body, as well as for non-existing conditions, e.g., supplements for bladder elasticity.




Attr

### RAG with llama3.2 on Ollama

In [33]:
from langchain import hub   # to import rag related prompt
from langchain_core.output_parsers import StrOutputParser # to get final output as string data
from langchain_core.runnables import RunnablePassthrough # to directly pass question to llm
from langchain_core.prompts import ChatPromptTemplate  # for prompt data

from langchain_ollama import ChatOllama # make connection b/w langchain and llama3.2

In [35]:
# Chat model served by the Ollama instance at localhost:11434.
model = ChatOllama(
    model="llama3.2:1b",
    base_url="http://localhost:11434",
)

# Smoke test: send a trivial message and display the reply.
model.invoke("hii")

AIMessage(content='Hello. How can I assist you today?', additional_kwargs={}, response_metadata={'model': 'llama3.2:1b', 'created_at': '2025-01-07T06:44:48.5027817Z', 'done': True, 'done_reason': 'stop', 'total_duration': 4930345000, 'load_duration': 3706270200, 'prompt_eval_count': 27, 'prompt_eval_duration': 485000000, 'eval_count': 10, 'eval_duration': 722000000, 'message': Message(role='assistant', content='', images=None, tool_calls=None)}, id='run-092b103c-6beb-46f0-afc3-3bf39fa5a484-0', usage_metadata={'input_tokens': 27, 'output_tokens': 10, 'total_tokens': 37})

In [36]:
# Raw prompt text for the LLM. `{question}` and `{context}` are placeholders: the chain
# fills them with the user's question and the retrieved chunks.
# The text limits answers to the supplied context and to at most 3 lines.

prompt = '''
    You are an AI assistant specifically designed to answer questions only related to the context provided to you. 
    Respond in a maximum of 3 lines, keeping the answers concise. 
    Use bullet points if possible, and ensure your answers are relevant to the question and 
    derived solely from the context. 
    Question: {question}
    Context: {context}
    Answer:
'''


In [37]:
# Turn the raw prompt text into a ChatPromptTemplate.
# The guard makes this cell safe to re-run: `prompt` is rebound from str to a template
# here, and passing an already-built template to `from_template` again would fail.
if isinstance(prompt, str):
    prompt = ChatPromptTemplate.from_template(prompt)

In [38]:
def format_docs(docs):
    """Build the prompt context from retrieved documents.

    Args:
        docs: Iterable of objects exposing a ``page_content`` string.

    Returns:
        The page contents joined into one string, separated by a blank line.
    """
    page_texts = (document.page_content for document in docs)
    return "\n\n".join(page_texts)

In [40]:
# print(format_docs(docs))

In [43]:
# Inputs for the prompt: "context" is produced by running the retriever and joining the
# hits with format_docs; "question" is the user's input passed through unchanged.
chain_inputs = {"context": retriver | format_docs, "question": RunnablePassthrough()}

# Full pipeline: build inputs -> fill the prompt -> call the model -> return plain text.
rag_chain = chain_inputs | prompt | model | StrOutputParser()

In [44]:
# Alternative question to try: "what are the benefits of the supplements"
question = "what is used to reduce the weight"

# Run retrieval and generation; the chain ends in StrOutputParser, so `output` is a str.
output = rag_chain.invoke(question)

In [45]:
print(type(output))

<class 'str'>


In [46]:
print(output)

• The majority of weight loss supplements used by young women, regardless of their weight [15], contain over 4000 individual substances, making it challenging to determine the effects on the body. 
• Some popular ingredients include chromium and chitosan, green tea, Garcinia cambogia, and bitter orange extracts.
