In [1]:
from langchain_community.document_loaders import PyMuPDFLoader, UnstructuredPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from semantic_text_splitter import TextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_chroma import Chroma
from langchain.vectorstores import FAISS
# import faiss
import os

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def load_pdf_from_folder(folder_path: str):
    """Load every PDF found directly inside ``folder_path`` with PyMuPDFLoader.

    Entries are visited in the order returned by ``os.listdir``; only names
    ending in ".pdf" are considered. A file is skipped, and reported on
    stdout, when the loader raises or when none of its loaded documents
    contain non-whitespace text.

    Args:
        folder_path: Directory to scan (sub-directories are not traversed).

    Returns:
        A flat list of the documents returned by ``loader.load()`` for every
        PDF that loaded successfully and had extractable text.
    """
    documents = []
    for file in os.listdir(folder_path):
        if not file.endswith(".pdf"):
            continue
        file_path = os.path.join(folder_path, file)
        try:
            docs = PyMuPDFLoader(file_path).load()

            # A PDF with no text layer yields only blank content. No OCR
            # fallback is implemented here, so such a file is treated as a
            # load failure.
            if not any(doc.page_content.strip() for doc in docs):
                raise ValueError("Empty text, fallback to OCR")
        except Exception:
            print(f"couldent load {file}")
            # Skip this file entirely: `docs` is either unbound, stale from
            # the previous iteration, or made of blank pages.
            continue
        documents.extend(docs)
    return documents

In [3]:
# Raw string: the Windows path contains backslashes ("\M", "\P") that would
# otherwise be parsed as (invalid) escape sequences.
docs = load_pdf_from_folder(r"D:\Machine Learning Practice\Pdf data")

couldent load allen H.B - Thermodynamics And An Introduction To Thermostatistics-Wiley (1985).pdf
couldent load BMP notes.pdf
couldent load classical-mechanics-j-c-upadhyay-2014-edition.pdf
couldent load Classical_Electrodynamics_Jackson_1a_Edition.pdf
couldent load dummit-amp-footex27s-algebra-pr_ff0ec52449105ac0359a55c3c10fbbd3.pdf
couldent load Francis A. Jenkins, Harvey E. White - Fundamentals of Optics, Fourth Edition     (2001, McGraw-Hill_.pdf
couldent load Griffiths - Introduction to quantum mechanics.pdf
couldent load real-analysis-by-bartle.pdf
couldent load the-feynman-lectures-on-physics-vol2-pr_535b8c931c7110f49cd675c08ed78b62.pdf
couldent load the-feynman-lectures-on-physics-vol3-pr_669bff2604e3c5241128c53adcb99a0d.pdf
MuPDF error: library error: FT_New_Memory_Face(GOOEBM+MSBM10): unknown file format

MuPDF error: library error: FT_New_Memory_Face(GPFBMM+MSBM7): unknown file format

MuPDF error: library error: FT_New_Memory_Face(GPHPAJ+stmary7): unknown file format

MuPDF

In [4]:
# Spot-check: print the loaded document at index 100 to inspect the extracted text.
print(docs[100])

page_content='84
CHAPTER 2. MATHEMATICAL TOOLS OF QUANTUM MECHANICS
2.2.4
Square-Integrable Functions: Wave Functions
In the case of function spaces, a “vector” element is given by a complex function and the scalar
product by integrals. That is, the scalar product of two functions Ox and Mx is given by
O M 
=
O`xMx dx
(2.21)
If this integral diverges, the scalar product does not exist. As a result, if we want the function
space to possess a scalar product, we must select only those functions for which O M is ﬁnite.
In particular, a function Ox is said to be square integrable if the scalar product of O with
itself,
O O 
=
Ox2 dx
(2.22)
is ﬁnite.
It is easy to verify that the space of square-integrable functions possesses the properties of
a Hilbert space. For instance, any linear combination of square-integrable functions is also a
square-integrable function and (2.21) satisﬁes all the properties of the scalar product of a Hilbert
space.
Note that the dimensio

In [5]:
def split_documents(documents, chunk_size = 1000, chunk_overlap = 200):
    """Split ``documents`` into chunks with a RecursiveCharacterTextSplitter.

    Args:
        documents: Documents to split; passed to the splitter unchanged.
        chunk_size: Forwarded to the splitter as ``chunk_size``.
        chunk_overlap: Forwarded to the splitter as ``chunk_overlap``.

    Returns:
        Whatever the splitter's ``split_documents`` returns for ``documents``.
    """
    # Separator list handed to the splitter: blank line, newline, space, "".
    fallback_separators = ["\n\n", "\n", " ", ""]
    splitter = RecursiveCharacterTextSplitter(
        separators=fallback_separators,
        chunk_overlap=chunk_overlap,
        chunk_size=chunk_size,
    )
    return splitter.split_documents(documents)

In [6]:
# Split the loaded documents using split_documents' default chunk settings.
chunks = split_documents(docs)

In [7]:
# Report how many chunks were produced by the split.
print(len(chunks))

40774


In [8]:
# Sentence-transformers embedding model. "cuda" assumes a CUDA-capable GPU is
# available in this environment; switch to "cpu" otherwise.
embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2",model_kwargs={"device": "cuda"})

# NOTE: earlier commented-out experiments (a persisted Chroma store and a
# single-call FAISS.from_documents build) were removed; the batched FAISS
# build below is the retained approach.

# Fail early with a clear message: with no chunks the loop below never runs,
# `vectordb` stays None, and saving would fail with an opaque AttributeError.
if not chunks:
    raise ValueError("No chunks to index; load and split the documents first.")

batch_size = 500  # safe batch size
vectordb = None
for i in range(0, len(chunks), batch_size):
    batch = chunks[i:i+batch_size]
    if vectordb is None:
        # The first batch creates the index; later batches are appended to it.
        vectordb = FAISS.from_documents(batch, embedding_model)
    else:
        vectordb.add_documents(batch)
    print(f"Processed chunks {i} to {i + len(batch)}")

# Persist the index to disk via the instance method.
vectordb.save_local("./faiss_db")
print("FAISS vector store saved successfully!")


Processed chunks 0 to 500
Processed chunks 500 to 1000
Processed chunks 1000 to 1500
Processed chunks 1500 to 2000
Processed chunks 2000 to 2500
Processed chunks 2500 to 3000
Processed chunks 3000 to 3500
Processed chunks 3500 to 4000
Processed chunks 4000 to 4500
Processed chunks 4500 to 5000
Processed chunks 5000 to 5500
Processed chunks 5500 to 6000
Processed chunks 6000 to 6500
Processed chunks 6500 to 7000
Processed chunks 7000 to 7500
Processed chunks 7500 to 8000
Processed chunks 8000 to 8500
Processed chunks 8500 to 9000
Processed chunks 9000 to 9500
Processed chunks 9500 to 10000
Processed chunks 10000 to 10500
Processed chunks 10500 to 11000
Processed chunks 11000 to 11500
Processed chunks 11500 to 12000
Processed chunks 12000 to 12500
Processed chunks 12500 to 13000
Processed chunks 13000 to 13500
Processed chunks 13500 to 14000
Processed chunks 14000 to 14500
Processed chunks 14500 to 15000
Processed chunks 15000 to 15500
Processed chunks 15500 to 16000
Processed chunks 160