In [None]:
import os

from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter

In [5]:
# Load the source PDF into LangChain documents (one per page) and preview page 1.
#
# Raw string: the Windows path contains backslashes (\D, \R, \d, \P) that are
# invalid escape sequences in a normal literal and trigger a SyntaxWarning on
# Python 3.12+. The resulting string value is unchanged.
# NOTE(review): this is an absolute, machine-specific path -- consider deriving
# it from a configurable data directory.
file_path = r"D:\Desktop\RAG_Project\data\PCOS_1.pdf"

# Fail fast: only printing here would leave `documents` undefined (or stale
# from an earlier kernel run) for every cell that follows.
if not os.path.exists(file_path):
    raise FileNotFoundError(f"Error: PDF file not found at {file_path}.")

loader = PyPDFLoader(file_path)
documents = loader.load()

# Guard the preview below, which indexes documents[0].
if not documents:
    raise ValueError(f"No pages could be loaded from {file_path}.")

print("--- Document Loading Complete ---")
print(f"Total pages/initial documents loaded: {len(documents)}")
print("Content of Page 1 (first 200 chars):")
# Newlines are flattened so the preview stays on a single output line.
print(documents[0].page_content[:200].replace('\n', ' '))
print(f"Metadata (source and page number): {documents[0].metadata}")
print("---------------------------------")


--- Document Loading Complete ---
Total pages/initial documents loaded: 28
Content of Page 1 (first 200 chars):
Part of the Pathways to Parenthood booklet series Polycystic o vary   syndrome (Pcos) InsIde: • The symptoms explained  • Your treatment options  • Coping emotionally with infertility
Metadata (source and page number): {'producer': 'Adobe PDF Library 9.9', 'creator': 'Adobe InDesign CS5.5 (7.5)', 'creationdate': '2012-12-12T17:39:51+11:00', 'moddate': '2012-12-12T18:01:46+11:00', 'trapped': '/False', 'source': 'D:\\Desktop\\RAG_Project\\data\\PCOS_1.pdf', 'total_pages': 28, 'page': 0, 'page_label': '1'}
---------------------------------


#### --- B. Splitting (Chunking) the Document ---

In [8]:
# Build the splitter used to cut the loaded pages into chunks.
# A size of 1000 with an overlap of 200 means neighbouring chunks share some
# trailing/leading text, so context is not lost at a chunk boundary.
splitter_settings = {
    "chunk_size": 1000,
    "chunk_overlap": 200,
}
text_splitter = RecursiveCharacterTextSplitter(**splitter_settings)

In [11]:
# Split every loaded page with the splitter configured above; the resulting
# `chunks` sequence is what the later cells count, inspect and embed.
chunks = text_splitter.split_documents(documents)

In [12]:
# --- C. Inspecting the Results ---
# Compare how many page-level documents went in with how many chunks came out.
page_count = len(documents)
chunk_count = len(chunks)

print("\n--- Chunking Results ---")
print(f"Total chunks created: {chunk_count}")
print(f"Original Pages: {page_count} -> New Chunks: {chunk_count}")


--- Chunking Results ---
Total chunks created: 67
Original Pages: 28 -> New Chunks: 67


In [14]:
# Visual check of the chunk overlap: show the end of chunk 4 next to the start
# of chunk 5 so the shared text can be compared by eye.
# NOTE(review): indices 4 and 5 are hard-coded, so this assumes at least six
# chunks exist. The shared text may also be shorter than 200 characters --
# confirm against the printed output.
print("\n--- The Importance of Overlap (The overlap is the last 200 chars of Chunk 4)")

print("Chunk 4 content ends with...")
chunk_4_tail = chunks[4].page_content[-200:]
print(chunk_4_tail.replace('\n', ' '))  # flatten newlines for a one-line preview

print("Chunk 5 content starts with...")
chunk_5_head = chunks[5].page_content[:200]
print(chunk_5_head.replace('\n', ' '))


--- The Importance of Overlap (The overlap is the last 200 chars of Chunk 4)
Chunk 4 content ends with...
............8 irregular periods ...................................................................................................................................................................... 9
Chunk 5 content starts with...
irregular periods ...................................................................................................................................................................... 9 difficulty be


# Vector Store Creation

In [15]:
# Embedding wrapper and vector-store class used by the cells below.
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS # vector store used in this notebook

In [16]:
# Name of the embedding model, kept in a constant so it is easy to swap.
# NOTE(review): presumably resolved as a Hugging Face / sentence-transformers
# model id by HuggingFaceEmbeddings -- confirm.
DEFAULT_EMBEDDING = "all-MiniLM-L6-v2"

# Embedding model later handed to the vector store to embed the chunks.
embedding_model = HuggingFaceEmbeddings(model_name=DEFAULT_EMBEDDING)

In [18]:
# Embed every chunk with `embedding_model` and index the result in FAISS.
# The variable name (sic) is kept as-is because the retrieval cell refers to it.
vectore_store = FAISS.from_documents(chunks, embedding_model)

# Quick confirmation that the store object was created (prints its repr).
print(vectore_store)

<langchain_community.vectorstores.faiss.FAISS object at 0x000002573CA47090>


In [21]:
# Smoke-test retrieval with a question the PDF should be able to answer.
query = "What percentage of women in Australia are diagnosed with PCOS?"

# k=1: request only the single best-matching chunk from the store.
retriver = vectore_store.as_retriever(search_kwargs=dict(k=1))
retrieved_docs = retriver.invoke(query)

print(f"\n--- Testing Retrieval for: '{query}' ---")
hit_count = len(retrieved_docs)
print(f"Found {hit_count} most relevant documents (chunks).")


--- Testing Retrieval for: 'What percentage of women in Australia are diagnosed with PCOS?' ---
Found 1 most relevant documents (chunks).


In [None]:
# Print each retrieved chunk with its provenance and a 400-character preview.
# `page` is the loader's metadata value; for page 1 the loader reported
# page=0, so it appears to be a 0-based index rather than the printed page number.
for rank, doc in enumerate(retrieved_docs, start=1):
    source = doc.metadata.get('source', 'Unknown')
    page = doc.metadata.get('page', 'Unknown')
    print(f"\n--- Document {rank} (Source: {source}, Page: {page}) ---")
    # The ellipsis is always appended, even when the chunk is shorter than 400 chars.
    print(doc.page_content[:400] + "...")


--- Document 1 (Source: D:\Desktop\RAG_Project\data\PCOS_1.pdf, Page: 5) ---
4
Who gets Pcos?
in australia up to 11% of all women are diagnosed with Pcos. 2 this translates to Pcos  
affecting about 400,000 a ustralian women of reproductive age. 3, 4 in indigenous a ustralian 
women, the occurrence of Pcos is much higher – around 21%. 2 it is thought that Pcos will 
likely increase in line with the rising rate of obesity in a ustralia. 
Why does it occur?
doctors are not e...
