In [1]:
from langchain_community.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS

In [6]:
# Install the PDF parsing package into the kernel's own environment.
# `%pip` (rather than a bare `pip` relying on automagic) targets the running
# kernel, and the version is pinned so a fresh run resolves the same release.
%pip install pypdf==5.5.0

Collecting pypdf
  Downloading pypdf-5.5.0-py3-none-any.whl (303 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m303.4/303.4 kB[0m [31m4.9 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Installing collected packages: pypdf
Successfully installed pypdf-5.5.0

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.0.1[0m[39;49m -> [0m[32;49m25.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [2]:
# Directory that holds the source PDF files.
DATA_PATH = "data/"


def load_pdf_files(data):
    """Load the PDF files under ``data`` that match the ``*.pdf`` glob.

    Args:
        data: Directory path handed to ``DirectoryLoader``.

    Returns:
        The result of ``DirectoryLoader.load()``, with each matching file
        read through ``PyPDFLoader``.
    """
    pdf_loader = DirectoryLoader(data, glob="*.pdf", loader_cls=PyPDFLoader)
    return pdf_loader.load()

In [3]:
# Read the PDFs found under DATA_PATH into `documents`.
documents = load_pdf_files(DATA_PATH)
# Optional check, left disabled:
# print("Length of PDF pages: ", len(documents))

In [4]:
def create_chunks(extracted_data, chunk_size=500, chunk_overlap=50):
    """Split loaded documents into overlapping chunks.

    Args:
        extracted_data: Documents to split; passed unchanged to
            ``RecursiveCharacterTextSplitter.split_documents``.
        chunk_size: Forwarded to the splitter as ``chunk_size``.
            Defaults to 500, the value previously hard-coded here.
        chunk_overlap: Forwarded to the splitter as ``chunk_overlap``.
            Defaults to 50, the value previously hard-coded here.

    Returns:
        The chunked documents produced by ``split_documents``.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return text_splitter.split_documents(extracted_data)

# Chunk the loaded documents for embedding.
text_chunks = create_chunks(documents)

In [6]:
# Sanity check: report how many chunks exist and show the start of the first one.
print(f"Total chunks created: {len(text_chunks)}")
print(" Sample chunk preview:")
sample_text = text_chunks[0].page_content
print(sample_text[:300])  # first 300 characters only

Total chunks created: 41267
 Sample chunk preview:
The GALE
ENCYCLOPEDIA of
MEDICINE
THIRD EDITION


In [5]:
def get_embedding_model():
    """Build the embedding model used for the vector store.

    Returns:
        A ``HuggingFaceEmbeddings`` instance configured with the
        ``sentence-transformers/all-MiniLM-L6-v2`` model name.
    """
    return HuggingFaceEmbeddings(
        model_name="sentence-transformers/all-MiniLM-L6-v2"
    )

# Instantiate the embedding model once; later cells reuse this object.
embedding_model = get_embedding_model()

  from .autonotebook import tqdm as notebook_tqdm


In [7]:
# Embed only the first chunk to confirm the model works and to see the
# dimensionality of the vectors it produces.
sample_embedding = embedding_model.embed_documents(
    [text_chunks[0].page_content]
)
print(f"Sample embedding length: {len(sample_embedding[0])}")

Sample embedding length: 384


In [8]:
# Folder the FAISS index is written to.
DB_FAISS_PATH = "vectorstore/db_faiss"

# Embed all chunks into a FAISS vector store, then persist it to disk.
db = FAISS.from_documents(documents=text_chunks, embedding=embedding_model)
db.save_local(folder_path=DB_FAISS_PATH)