In [2]:
import os
from typing import List

import faiss, chromadb
from llama_index.readers.file import PDFReader
from llama_index.core.schema import Document, TextNode
from llama_index.core.node_parser import SemanticSplitterNodeParser
from langchain_community.embeddings import OllamaEmbeddings
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core import VectorStoreIndex, StorageContext
from llama_index.vector_stores.faiss import FaissVectorStore
from langchain_community.vectorstores import FAISS
from llama_index.vector_stores.chroma import ChromaVectorStore
from llama_index.core import VectorStoreIndex


  from .autonotebook import tqdm as notebook_tqdm


### Loading PDF File

In [3]:
# Location of the source PDF, relative to the notebook's working directory.
# Forward slashes resolve on Windows and POSIX alike; the previous
# backslash-separated form only worked on Windows.
file_path="../data/nabh_gib_hos.pdf"

# Read the PDF into LlamaIndex Document objects.
pdf_reader=PDFReader()
documents:List[Document]=pdf_reader.load_data(file_path)

In [None]:
# Inspect the Document objects returned by the PDF reader.
documents

### Semantic Chunking

In [4]:
# Embedding backend used by the semantic splitter below.
# Alternatives that were tried here and left disabled:
#   OpenAIEmbedding()
#   OllamaEmbeddings(model_name="nomic-embed-text", show_progress=True)
embedding_model = HuggingFaceEmbedding(model_name="all-MiniLM-L6-v2")

# Splitter that chooses chunk boundaries from embedding similarity between
# neighbouring sentence groups rather than from a fixed character count.
semantic_splitter = SemanticSplitterNodeParser(
    embed_model=embedding_model,
    buffer_size=1,
    breakpoint_percentile_threshold=95,
)

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


In [5]:
# Run the semantic splitter over the loaded documents, showing a progress bar.
nodes: List[TextNode] = semantic_splitter.get_nodes_from_documents(
    documents,
    show_progress=True,
)

Generating embeddings: 100%|██████████| 1/1 [00:00<00:00, 12.83it/s]
Generating embeddings: 100%|██████████| 11/11 [00:00<00:00, 34.80it/s]
Generating embeddings: 100%|██████████| 22/22 [00:00<00:00, 67.04it/s]
Generating embeddings: 100%|██████████| 12/12 [00:00<00:00, 35.96it/s]
Generating embeddings: 100%|██████████| 3/3 [00:00<00:00, 27.42it/s]
Generating embeddings: 100%|██████████| 12/12 [00:00<00:00, 60.18it/s]
Generating embeddings: 100%|██████████| 5/5 [00:00<00:00, 41.71it/s]
Generating embeddings: 100%|██████████| 17/17 [00:00<00:00, 55.76it/s]
Generating embeddings: 100%|██████████| 8/8 [00:00<00:00, 32.39it/s]
Generating embeddings: 100%|██████████| 5/5 [00:00<00:00, 21.08it/s]
Generating embeddings: 100%|██████████| 1/1 [00:00<00:00, 42.13it/s]
Generating embeddings: 100%|██████████| 9/9 [00:00<00:00, 41.84it/s]
Generating embeddings: 100%|██████████| 11/11 [00:00<00:00, 34.78it/s]
Generating embeddings: 100%|██████████| 14/14 [00:00<00:00, 58.03it/s]
Generating embedding

### Creating Embeddings and Indexing using FAISS

In [None]:
# IndexFlatL2 must be told the vector dimensionality at construction time.
# Measure it from the embedding model actually in use rather than hard-coding
# it: 1536 is the OpenAI embedding size, while the HuggingFace
# all-MiniLM-L6-v2 model configured earlier emits 384-dimensional vectors,
# and a mismatch makes every insert into the index fail.
dim=len(embedding_model.get_text_embedding("dimension probe"))
faiss_index=faiss.IndexFlatL2(dim)

In [None]:
# Expose the raw FAISS index through LlamaIndex's vector-store interface and
# make it the backing store of a fresh storage context.
vector_store = FaissVectorStore(faiss_index=faiss_index)
storage_context = StorageContext.from_defaults(
    vector_store=vector_store,
)

In [None]:
# Embed the semantic nodes and index them in the FAISS-backed storage context.
vector_index = VectorStoreIndex(
    nodes=nodes,
    storage_context=storage_context,
    embed_model=embedding_model,
)

# Write the index and its stores to disk so the Retrieval section can reload them.
vector_index.storage_context.persist("../storage")

In [6]:
# In-memory Chroma client; nothing is written to disk.
chroma_client = chromadb.EphemeralClient()
# get_or_create_collection keeps this cell re-runnable: create_collection
# raises once a collection with the same name already exists in the process.
chroma_collection = chroma_client.get_or_create_collection("nabh_data")

# Back a LlamaIndex storage context with the Chroma collection.
vector_store = ChromaVectorStore(chroma_collection=chroma_collection)
storage_context = StorageContext.from_defaults(vector_store=vector_store)

In [None]:
# Build a vector index over the semantic nodes using the current
# `storage_context` and the notebook's embedding model.
index = VectorStoreIndex(
    nodes,
    embed_model=embedding_model,
    storage_context=storage_context,
)

### Retrieval

In [None]:
from llama_index.core import StorageContext, load_index_from_storage
from llama_index.vector_stores.faiss import FaissVectorStore
from llama_index.core import Settings

In [None]:
# Reload the FAISS vector store and the accompanying stores persisted earlier.
vector_store=FaissVectorStore.from_persist_dir("../storage")
storage_context=StorageContext.from_defaults(vector_store=vector_store,persist_dir="../storage")
# Pass the embedding model explicitly: without it the reloaded index falls
# back to the global default embedding model, so queries would be embedded by
# a different model than the one that produced the stored vectors.
vector_index=load_index_from_storage(storage_context,embed_model=embedding_model)

In [None]:
# Retrieve the 3 nearest nodes per query. The option must be passed as a real
# keyword argument; wrapping it in a `kwargs={...}` dict hands the retriever a
# single unknown keyword named "kwargs" and leaves similarity_top_k at its default.
retriever=vector_index.as_retriever(similarity_top_k=3)

In [None]:
# Run a sample question through the retriever.
docs = retriever.retrieve(
    str_or_query_bundle='what are the certification programs provided by nabh',
)

In [None]:
# Text of the first retrieved node.
docs[0].node.text

In [None]:
# Full list of retrieval results.
docs

In [None]:
# Second HuggingFace embedding model, used for the document-level Chroma index.
embed_model = HuggingFaceEmbedding(
    model_name="BAAI/bge-small-en-v1.5",
)

In [None]:
# In-memory Chroma client; nothing is written to disk.
chroma_client = chromadb.EphemeralClient()
# get_or_create_collection keeps this cell re-runnable: create_collection
# raises once a collection with the same name already exists in the process.
chroma_collection = chroma_client.get_or_create_collection("nabh_collection")

In [None]:
# Back a LlamaIndex storage context with the current Chroma collection.
vector_store = ChromaVectorStore(
    chroma_collection=chroma_collection,
)
storage_context = StorageContext.from_defaults(
    vector_store=vector_store,
)

In [None]:
# Display the loaded documents again before indexing them.
documents

In [None]:
# Index the unsplit documents directly, using the current `storage_context`
# and the `embed_model` defined for this section.
index = VectorStoreIndex.from_documents(
    documents,
    embed_model=embed_model,
    storage_context=storage_context,
)

In [None]:
# `documents` holds LlamaIndex Document objects, but LangChain's
# FAISS.from_documents reads `page_content` from each item, which LlamaIndex
# documents do not provide. Convert them to LangChain documents first.
langchain_documents = [doc.to_langchain_format() for doc in documents]
db = FAISS.from_documents(langchain_documents, OllamaEmbeddings(model="nomic-embed-text",show_progress=True))

## Retriever using LangChain and FAISS with HuggingFace

In [2]:
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_core.documents import Document
from langchain_community.document_loaders import PyPDFLoader

In [3]:
file_path="..\\data\\nabh_gib_hos.pdf"

loader = PyPDFLoader(file_path)
pages = []
async for page in loader.alazy_load():
    pages.append(page)

In [4]:
# Number of Document objects produced by the loader.
len(pages)

21

In [16]:
# Character-based splitter: chunks of up to 1250 characters with a
# 100-character overlap between consecutive chunks.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=100,
    chunk_size=1250,
)

In [17]:
# Break the loaded pages into overlapping chunks.
split_docs = text_splitter.split_documents(
    pages,
)

In [18]:
# Number of chunks produced by the splitter.
len(split_docs)

34

In [19]:
# Spot-check one chunk (index 10): metadata plus page content.
split_docs[10]

Document(metadata={'producer': 'PDFium', 'creator': 'PDFium', 'creationdate': 'D:20250227212411', 'source': '..\\data\\nabh_gib_hos.pdf', 'total_pages': 21, 'page': 6, 'page_label': '7'}, page_content='NATIONAL ACCREDITATION BOARD FOR HOSPITALS  \n& HEALTHCARE PROVIDERS (NABH) \n   \nPage 7  \nBenefits of Accreditation \n \n \n  \n \n \nBenefits for Staff \n \n• The staff in an accredited Hospital is satisfied lot as it provides for continuous \nlearning, good working environment and leadership.  \n• Efficiencies and competencies of staff also gets improved in an accredited Hospital.  \n• It improves overall professional de velopment, knowledge and competencies in \nsystematic ways with defined ownership and accountability of all the staff including \nMedical and Para Medical Staff.  \n \nBenefits to paying and regulatory bodies \n \nFinally, accreditation provides an objective system of  empanelment by insurance and \nother third parties. Accreditation provides access to reliable and 

In [20]:
# LangChain wrapper around the sentence-transformers MiniLM embedding model.
embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2",
)

  embeddings=HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
  from .autonotebook import tqdm as notebook_tqdm
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


In [21]:
# Embed every chunk and build an in-memory FAISS store from the results.
vector_store = FAISS.from_documents(
    documents=split_docs,
    embedding=embeddings,
)

In [22]:
# Folder (relative to the working directory) that receives the saved FAISS store.
vector_store_name = "faiss_storage"
vector_store.save_local(folder_path=vector_store_name)

In [23]:
# Ask the store for the five chunks most similar to a sample question.
result = vector_store.similarity_search(
    query="What are the certification programs provided by nabh?",
    k=5,
)

In [None]:
# Page content of the fifth search hit (index 4 of the k=5 results).
result[4].page_content

'NATIONAL ACCREDITATION BOARD FOR HOSPITALS  \n& HEALTHCARE PROVIDERS (NABH) \n   \nPage 11  \nAccreditation Process'