In [None]:
import os

import fitz
from langchain.docstore.document import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain_huggingface import HuggingFaceEmbeddings
from sentence_transformers import SentenceTransformer


Extracting the text from the PDF

In [None]:
def text_to_pdf(pdf_path):
    """Extract the text of every page of a PDF and return it as one string.

    Despite the name, this reads text *from* the PDF at ``pdf_path``; the name
    is kept so existing callers continue to work.

    Args:
        pdf_path: Path handed to ``fitz.open``.

    Returns:
        The ``get_text()`` output of all pages, concatenated in page order.
        An empty string if the document has no pages.
    """
    # The context manager closes the document even if a page fails to extract;
    # the original left the file handle open.
    with fitz.open(pdf_path) as doc:
        # join() avoids re-copying the accumulated string on every page.
        return "".join(page.get_text() for page in doc)

Converting the extracted text into a Document

In [None]:
def document(text):
    """Wrap ``text`` in a single ``Document`` and return it inside a list.

    Args:
        text: Value stored as the document's ``page_content``.

    Returns:
        A one-element list holding the new ``Document``.
    """
    wrapped = Document(page_content=text)
    return [wrapped]

Splitting the document into chunks

In [None]:
def split_chunks(docs, chunk_size, chunk_overlap=50):
    """Split documents with a ``RecursiveCharacterTextSplitter``.

    Args:
        docs: Documents passed to the splitter's ``split_documents``.
        chunk_size: Forwarded to the splitter as ``chunk_size``.
        chunk_overlap: Forwarded to the splitter as ``chunk_overlap``.
            Defaults to 50.

    Returns:
        Whatever ``split_documents`` returns for ``docs``.
    """
    return RecursiveCharacterTextSplitter(
        chunk_size=chunk_size, chunk_overlap=chunk_overlap
    ).split_documents(docs)

Embedding the chunks and storing them in a FAISS index

In [None]:
def embeded_store(chunks, embedding_model):
    """Build a FAISS store from ``chunks`` using ``embedding_model``.

    Args:
        chunks: Documents passed as the first argument to
            ``FAISS.from_documents``.
        embedding_model: Embedding object passed as the second argument.

    Returns:
        The object returned by ``FAISS.from_documents``.
    """
    store = FAISS.from_documents(chunks, embedding_model)
    return store

Searching

In [None]:
def search_and_print(db, query, label, k=3, preview_chars=500):
    """Run a similarity search and print a preview of each hit.

    Args:
        db: Object exposing ``similarity_search(query, k=...)`` whose results
            carry a ``page_content`` string (e.g. the store returned by
            ``embeded_store``).
        query: Search text passed straight to ``db.similarity_search``.
        label: Text inserted into the header line to identify the store.
        k: Number of results to request. Defaults to 3.
        preview_chars: Maximum number of characters printed per result.
            Defaults to 500.

    Returns:
        None. All output is written with ``print``.
    """
    print(f"\n\nresult for the chunk size: {label} ---")
    results = db.similarity_search(query, k=k)

    # enumerate(start=1) replaces the hand-maintained result counter.
    for rank, result in enumerate(results, start=1):
        print("Result", rank)
        print(result.page_content[:preview_chars])

In [None]:
# Location of the PDF to index. Set the PDF_PATH environment variable to point
# at another file; when it is unset the original hard-coded path is used, so
# existing runs behave as before.
pdf_path = os.environ.get(
    "PDF_PATH",
    "C:/Users/sayhe/PycharmProjects/Summer_internship/artificial_intelligence.pdf",
)
query = input("What do you want to search in pdf : ")

print("Extracting the texts...")
text = text_to_pdf(pdf_path)
docs = document(text)

# One embedding model instance is shared by every store built later on.
embedding_model = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2"
)



In [None]:
# One chunk list per chunk size. The generator is consumed left to right, so
# the splits run in the same order as separate calls would (256, 512, 1024).
chunks_256, chunks_512, chunks_1024 = (
    split_chunks(docs, chunk_size=size) for size in (256, 512, 1024)
)

print("embedding and storing the chunks...")


In [None]:
# Build one store per chunk list, all with the same embedding model, in the
# order 256, 512, 1024.
db_256, db_512, db_1024 = (
    embeded_store(chunk_list, embedding_model)
    for chunk_list in (chunks_256, chunks_512, chunks_1024)
)

In [None]:
# Run the same query against each store so the results can be compared.
# RecursiveCharacterTextSplitter measures chunk_size with len() by default,
# i.e. in characters, so the labels say "characters" rather than "tokens".
search_and_print(db_256, query, "256 characters")
search_and_print(db_512, query, "512 characters")
search_and_print(db_1024, query, "1024 characters")