In [None]:
# !git clone https://github.com/laxmimerit/rag-dataset

Cloning into 'rag-dataset'...


In [None]:
from langchain_community.document_loaders import PyMuPDFLoader
import os

In [5]:
loader = PyMuPDFLoader(r"rag-dataset\gym supplements\1. Analysis of Actual Fitness Supplement.pdf")

In [9]:
pdfs = []
for root, dirs, files in os.walk("rag-dataset"):
    for file in files:
        if file.endswith(".pdf"):
            pdfs.append(os.path.join(root, file))

In [None]:
pdfs

['rag-dataset\\gym supplements\\1. Analysis of Actual Fitness Supplement.pdf',
 'rag-dataset\\gym supplements\\2. High Prevalence of Supplement Intake.pdf',
 'rag-dataset\\health supplements\\1. dietary supplements - for whom.pdf',
 'rag-dataset\\health supplements\\2. Nutraceuticals research.pdf',
 'rag-dataset\\health supplements\\3.health_supplements_side_effects.pdf']

In [12]:
docs = []

for pdf in pdfs:
    loader = PyMuPDFLoader(pdf)
    temp = loader.load()
    docs.extend(temp)

In [14]:
len(docs)

64

### Chunking

In [15]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

In [19]:
text_splitter = RecursiveCharacterTextSplitter(chunk_size = 1000, chunk_overlap = 100)
chunks = text_splitter.split_documents(docs)

In [25]:
print(len(chunks[0].page_content))
print(len(chunks[1].page_content))
print(len(chunks[2].page_content))
print(len(chunks[3].page_content))
print(len(chunks[4].page_content))

981
954
973
984
759


### Document Vector Embedding

In [43]:
from langchain_community.embeddings import OllamaEmbeddings
import faiss
from langchain_community.vectorstores import FAISS
from langchain_community.docstore.in_memory import InMemoryDocstore

In [32]:
embeddings = OllamaEmbeddings(
    model = "nomic-embed-text",
    base_url = "http://localhost:11434"
)

  embeddings = OllamaEmbeddings(


In [35]:
vectors = embeddings.embed_query(chunks[0].page_content)

### Storing Embedding in Vector Store
- Get the Index
- Create the Vector Store

In [None]:
index = faiss.IndexFlatIP(len(vectors))
{index.ntotal, index.d}

{0, 768}

In [44]:
vector_store = FAISS(
    embedding_function = embeddings.embed_query,
    index = index,
    docstore = InMemoryDocstore(),
    index_to_docstore_id = {}
)

`embedding_function` is expected to be an Embeddings object, support for passing in a function will soon be removed.


In [45]:
ids = vector_store.add_documents(documents = chunks)

In [46]:
len(ids), vector_store.index.ntotal

(321, 321)

In [47]:
question = "how to get muscle mass ?"
results = vector_store.search(query = question, k = 5, search_type = "similarity")
results

[Document(id='5c726ac9-b21e-42c2-8795-3824ca1de26b', metadata={'producer': 'iLovePDF', 'creator': '', 'creationdate': '', 'source': 'rag-dataset\\gym supplements\\1. Analysis of Actual Fitness Supplement.pdf', 'file_path': 'rag-dataset\\gym supplements\\1. Analysis of Actual Fitness Supplement.pdf', 'total_pages': 15, 'format': 'PDF 1.7', 'title': '', 'author': '', 'subject': '', 'keywords': '', 'moddate': '2024-10-21T11:38:50+00:00', 'trapped': '', 'modDate': 'D:20241021113850Z', 'creationDate': '', 'page': 1}, page_content='prevalence, methods, and reasons for supplement consumption, it is evident from the\nfindings that supplement consumption is prevalent among athletes and even more so in\nindividuals competing at higher levels. It can also be confirmed that most gym-goers\ngenerally use dietary supplements, with a prevalence exceeding 40%, with a prevalence of\n44% in Portugal and 81% in South Africa [8]. In a set of different studies, the prevalence\namong gym-goers ranged from 2

In [48]:
db_name = "health_supplements"
vector_store.save_local(db_name)