In [None]:
# 📌 Step 1: Loading Required Libraries
from langchain_community.document_loaders import PDFPlumberLoader   # For loading PDFs
from langchain_text_splitters import RecursiveCharacterTextSplitter # For splitting text
from langchain_community.embeddings import HuggingFaceEmbeddings    # FREE embeddings model
from langchain_community.vectorstores import FAISS                  # FAISS for vector storage and retrieval

In [8]:
# 📌 Step 2: Load the PDF Document
# Point the loader at the PDF on disk, then pull its text into memory.
# `pdf_documents` is what the splitting step consumes next.
pdf_path = "Aromatase Paper.pdf"
loader = PDFPlumberLoader(file_path=pdf_path)
pdf_documents = loader.load()


In [14]:
# 📌 Step 3: Splitting Text into Chunks
# Many AI models have token limits, so the extracted text is cut into smaller pieces.
# Neighbouring chunks overlap so that content near a chunk boundary is not lost.
# NOTE(review): 4000-character chunks are probably much longer than the embedding model's
# maximum input length (sentence-transformers models truncate long inputs) — confirm, and
# consider a smaller chunk_size if retrieval quality matters.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=4000,
    chunk_overlap=500,
)
documents = text_splitter.split_documents(pdf_documents)

In [29]:
# 📌 Step 4: Generate Embeddings using a FREE Hugging Face Model
# OpenAI embeddings require an API key and may have usage limits.
# This notebook uses the Hugging Face model "paraphrase-MiniLM-L12-v2" as a FREE alternative.
# ("all-MiniLM-L6-v2" is another lightweight option; change EMBEDDING_MODEL_NAME to switch.
#  If the model changes, the FAISS index built from these embeddings must be rebuilt.)
EMBEDDING_MODEL_NAME = "paraphrase-MiniLM-L12-v2"
embeddings = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL_NAME)


In [30]:
# 📌 Step 5: Store Vectorized Data in FAISS (Fast and Scalable)
# FAISS (Facebook AI Similarity Search) is an optimized library for searching embeddings.
# It is lightweight and does not require external servers like ChromaDB.
#
# Only the first MAX_INDEXED_CHUNKS chunks are embedded and stored; anything after that
# is NOT searchable. Set MAX_INDEXED_CHUNKS = None to index every chunk
# (`documents[:None]` is the whole list).
MAX_INDEXED_CHUNKS = 20
db = FAISS.from_documents(documents[:MAX_INDEXED_CHUNKS], embeddings)

In [49]:
# 📌 Step 6: Perform a Similarity Search
# Ask the FAISS vector database for the 3 stored chunks closest to the query text.
query = "What is the paper about?"
result = db.similarity_search(query, k=3)

In [53]:
# 📌 Step 7: Display the Most Relevant Search Result
# Take the first chunk returned by the search and show at most its first
# 800 characters (adjust the slice bound as needed).
top_chunk_text = result[0].page_content
short_response = top_chunk_text[:800]
print("🔍 FAISS Search Result:\n", short_response)

🔍 FAISS Search Result:
 new england journal medicine
The of
ovulation in women with infertility.34 The data in
the current review, however, pertain solely to post-
Androstenedione Testosterone
menopausal women.
Peripheral tissues (subcutaneous
fat, liver, muscle, or brain) clinical development
and pharmacology
Aromatase
Aromatase Aromatase
inhibitors
Aminoglutethimide, the first aromatase inhibitor,
was initially developed as an anticonvulsant but was
withdrawn from use after reports of adrenal insuf-
Estrone
ficiency. It was subsequently found to inhibit several
cytochrome P-450 enzymes involved in adrenal ste-
Estradiol roidogenesis and was then redeveloped for use as
“medical adrenalectomy” against advanced breast
cancer.35,36 Side effects, including drowsiness and
Tamoxifen rash, limited its use, but the disc


In [6]:
# Limiting responses as needed.

In [47]:
# Second query: fetch only the single closest chunk this time (k=1).
# Note that this overwrites `query` and `result` from the previous search.
query = "What is breast cancer ?"
result = db.similarity_search(query, k=1)

In [48]:
# Print the full text of the first returned chunk (no truncation here).
full_response = result[0].page_content
print("🔍 FAISS Search Result:\n", full_response)

🔍 FAISS Search Result:
 gery63,64 (Table 2). large, estrogen-receptor–positive cancers.
There was also an unexpected and potentially
important finding in a subgroup of patients whose Adjuvant Therapy
tumors were available for further analysis: of 17 pa- Tamoxifen given for approximately five years after
tients whose tumors overexpressed the cell-surface surgery to patients with early, estrogen-receptor–
growth factor receptor c-ErbB-2 (HER2), c-ErbB-1 positive breast cancer is the current standard of care
(epidermal growth factor receptor [EGFR]) or both, worldwide. This approach reduces the risk of death
15 (88 percent) had a response to letrozole, as com- by about 25 percent, a reduction that translates
pared with only 4 of 19 (21 percent) with a response into an absolute improvement in 10-year survival
to tamoxifen (Table 2).64 These findings are consis- of more than 10 percent for patients with involved
tent with the in vitro and in vivo observations that nodes and 5 percent for th