In [None]:
from langchain_ollama import OllamaEmbeddings
from langchain_community.vectorstores import FAISS
from langchain.docstore.document import Document
from langchain.document_loaders import PyPDFLoader
import os
from langchain.text_splitter import RecursiveCharacterTextSplitter
import tiktoken
import pandas as pd
# from langchain_community.embeddings import OllamaEmbeddings


In [3]:
# Step 1: Load the PDF (splitting into chunks happens later, in Step 3).
# The path is relative, so it is resolved against the kernel's working directory.
pdf_path = "Health_and_Disease_A_Lifespan_Approach.pdf"  # replace with your PDF file
loader = PyPDFLoader(pdf_path)
# `pages` holds the loaded documents; later cells read `.page_content` from each entry.
pages = loader.load()

In [None]:
# Step 2: Estimate the token footprint of every page with tiktoken.
# The gpt-3.5-turbo encoding is only a convenient yardstick; the embedding
# model used further down may tokenize differently, so treat these numbers
# as approximations.
encoding = tiktoken.encoding_for_model("gpt-3.5-turbo")

# One (label, count) pair per page; labels are 1-based: "Page1", "Page2", ...
token_counts = [
    (f"Page{page_number}", len(encoding.encode(page_doc.page_content)))
    for page_number, page_doc in enumerate(pages, start=1)
]

# Persist the table so it can be inspected outside the notebook.
token_count_table = pd.DataFrame(token_counts, columns=["Page", "TokenCount"])
token_count_table.to_csv("page_token_counts.csv", index=False)


In [4]:
# Step 3: Break the loaded pages into smaller, slightly overlapping chunks
# that will be embedded individually.
# chunk_size / chunk_overlap are measured with the splitter's default length
# function (character count): at most 800 per chunk, 20 shared between neighbours.
splitter = RecursiveCharacterTextSplitter(
    chunk_size=800,
    chunk_overlap=20,
)
docs = splitter.split_documents(pages)



In [None]:
# Create the embedding client; it is used both to build the FAISS index and
# to load/query it in later cells.
# NOTE(review): presumably needs a reachable Ollama server with the
# "nomic-embed-text" model already pulled — confirm before running.
embedding = OllamaEmbeddings(model="nomic-embed-text")


In [8]:
# embedding.embed_documents([doc.page_content for doc in a])

In [None]:
# Embed every chunk in `docs` and build a FAISS index from them, then write
# the index to the local "faiss_index_pdf" folder so it can be reloaded later
# without re-embedding the whole PDF.
db = FAISS.from_documents(documents=docs, embedding=embedding)
db.save_local(folder_path="faiss_index_pdf")


In [None]:
# Reload the FAISS index that was saved to disk (the retriever is built in a
# later cell).
# NOTE: allow_dangerous_deserialization=True permits unpickling the stored
# docstore; only enable it for index folders you created yourself.
db_loaded = FAISS.load_local(
    "faiss_index_pdf",
    embedding,
    allow_dangerous_deserialization=True,
)



In [None]:
# Data can be retrieved in 2 ways:
# 1. Through a retriever built on top of the loaded index

# Default retriever (returns the top 4 docs):
# retriever = db_loaded.as_retriever()

# Threshold retriever: return at most 2 docs, and only those whose relevance
# score reaches 0.1.
retriever = db_loaded.as_retriever(
    search_type="similarity_score_threshold",
    search_kwargs={"k": 2, "score_threshold": 0.1},
)


In [None]:
# Example: fetch relevant docs for a query.
# `invoke` is the supported retriever entry point; `get_relevant_documents`
# is deprecated in langchain-core and emits a deprecation warning.
query = "How are you?"
results = retriever.invoke(query)
results

[Document(id='ed3742a5-f6b4-4be0-8176-995137ab0791', metadata={'producer': 'PDFium', 'creator': 'PDFium', 'creationdate': 'D:20250711133424', 'source': 'Health_and_Disease_A_Lifespan_Approach.pdf', 'total_pages': 567, 'page': 187, 'page_label': '188'}, page_content='3\n)\tis\ta\tmarker\tof\trelative\tthinness\t(low\tponderal\nindex)\tor\tfatness\t(high\tponderal\tindex)\tat\tbirth.\tPonderal\tindex\tat\tbirth\twas\talso\ta\tsignificant\npredictor\tof\tblood\tpressure.\tThis\tdata\ttherefore\tshowed\tthat\tbabies\twho\twere\tborn\tsmall\tand\nthin\twere\tat\tgreater\trisk\tof\tdisease\tin\tlater\tlife.\tThe\tUS\tNurses\tHealth\tStudy\tcollected\tdata\ton'),
 Document(id='43cd152f-204f-46d8-b40e-ed3a21a866b2', metadata={'producer': 'PDFium', 'creator': 'PDFium', 'creationdate': 'D:20250711133424', 'source': 'Health_and_Disease_A_Lifespan_Approach.pdf', 'total_pages': 567, 'page': 114, 'page_label': '115'}, page_content='maintain\tand\tsupport\tthe\tpregnancy,\tprevent\timmunological\trej

In [33]:

# Same query, but asking for up to 4 docs.
# Passing `kwargs={"k": 4}` to `invoke` only forwards a keyword literally
# named "kwargs"; it is not applied as the retriever's `k`. The limit is
# therefore set through `search_kwargs` on a dedicated retriever.
query = "How are you?"
retriever_top4 = db_loaded.as_retriever(
    search_type="similarity_score_threshold",
    search_kwargs={"score_threshold": 0.1, "k": 4},
)
retriever_top4.invoke(query)


[Document(id='ed3742a5-f6b4-4be0-8176-995137ab0791', metadata={'producer': 'PDFium', 'creator': 'PDFium', 'creationdate': 'D:20250711133424', 'source': 'Health_and_Disease_A_Lifespan_Approach.pdf', 'total_pages': 567, 'page': 187, 'page_label': '188'}, page_content='3\n)\tis\ta\tmarker\tof\trelative\tthinness\t(low\tponderal\nindex)\tor\tfatness\t(high\tponderal\tindex)\tat\tbirth.\tPonderal\tindex\tat\tbirth\twas\talso\ta\tsignificant\npredictor\tof\tblood\tpressure.\tThis\tdata\ttherefore\tshowed\tthat\tbabies\twho\twere\tborn\tsmall\tand\nthin\twere\tat\tgreater\trisk\tof\tdisease\tin\tlater\tlife.\tThe\tUS\tNurses\tHealth\tStudy\tcollected\tdata\ton'),
 Document(id='43cd152f-204f-46d8-b40e-ed3a21a866b2', metadata={'producer': 'PDFium', 'creator': 'PDFium', 'creationdate': 'D:20250711133424', 'source': 'Health_and_Disease_A_Lifespan_Approach.pdf', 'total_pages': 567, 'page': 114, 'page_label': '115'}, page_content='maintain\tand\tsupport\tthe\tpregnancy,\tprevent\timmunological\trej

In [None]:
# 2. Query the vector store directly with similarity_search_with_score.
# Returns the top-k documents, each paired with its score (commonly a cosine
# similarity or an L2 distance, depending on how the index was built).
# Simple and fast; good for direct similarity-based retrieval.

query = "How are you?"
a = db_loaded.similarity_search_with_score(query=query, k=2)
a

[(Document(id='ed3742a5-f6b4-4be0-8176-995137ab0791', metadata={'producer': 'PDFium', 'creator': 'PDFium', 'creationdate': 'D:20250711133424', 'source': 'Health_and_Disease_A_Lifespan_Approach.pdf', 'total_pages': 567, 'page': 187, 'page_label': '188'}, page_content='3\n)\tis\ta\tmarker\tof\trelative\tthinness\t(low\tponderal\nindex)\tor\tfatness\t(high\tponderal\tindex)\tat\tbirth.\tPonderal\tindex\tat\tbirth\twas\talso\ta\tsignificant\npredictor\tof\tblood\tpressure.\tThis\tdata\ttherefore\tshowed\tthat\tbabies\twho\twere\tborn\tsmall\tand\nthin\twere\tat\tgreater\trisk\tof\tdisease\tin\tlater\tlife.\tThe\tUS\tNurses\tHealth\tStudy\tcollected\tdata\ton'),
  np.float32(1.0390892)),
 (Document(id='43cd152f-204f-46d8-b40e-ed3a21a866b2', metadata={'producer': 'PDFium', 'creator': 'PDFium', 'creationdate': 'D:20250711133424', 'source': 'Health_and_Disease_A_Lifespan_Approach.pdf', 'total_pages': 567, 'page': 114, 'page_label': '115'}, page_content='maintain\tand\tsupport\tthe\tpregnancy,\