In [None]:
from langchain_community.document_loaders import PyPDFLoader

# 1. Load the PDF
# loader.load() yields a list of Document objects; the printed output shows each
# one carrying metadata (e.g. 'source', 'page', 'total_pages') and page_content.
pdf_path = "data/chase_banking.pdf"
loader = PyPDFLoader(pdf_path)
documents = loader.load()
print(documents)

[Document(metadata={'producer': 'Adobe PDF Library 17.0', 'creator': 'Adobe InDesign 19.5 (Macintosh)', 'creationdate': '2024-10-03T09:29:06-04:00', 'author': 'JPMorgan Chase Bank', 'keywords': 'Chase; total; checking; guide to your account; ada; (PDF)', 'moddate': '2024-10-07T09:59:35-04:00', 'subject': 'Chase Total Checking - A Guide To Your Account', 'title': 'Chase Total Checking - A Guide To Your Account (PDF)', 'trapped': '/Unknown', 'source': 'data/chase_banking.pdf', 'total_pages': 4, 'page': 0, 'page_label': '1'}, page_content='HAVE QUESTIONS? CALL US AT 1-800-935-9935 (WE ACCEPT OPERATOR RELAY CALLS)  • WANT MORE INFO?  SEE THE DEPOSIT ACCOUNT AGREEMENT\n1\nCHASE TOTAL CHECKING\n®\nA GUIDE TO YOUR ACCOUNT †\nIt’s important that you understand how your Chase Total Checking account works. \nWe’ve created this Guide to explain the fees and some key terms of your personal account.\nMONTHLY \nSERVICE FEE*\nMonthly Service Fee $12\nWays to Avoid the \nMonthly Service Fee\n$0 Monthl

In [4]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain.embeddings import OpenAIEmbeddings
from langchain.schema import Document

# 2. Split the loaded documents into chunks
# chunk_size / chunk_overlap are measured by the splitter's length function
# (presumably characters by default — confirm against the splitter docs).
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=50,
)
docs = text_splitter.split_documents(documents)

In [8]:
# Inspect the chunks produced by the splitter.
# NOTE(review): the saved output already contains 'chunk_index', which is only
# set by the "#3. add meta data" cell further down. Going by the execution
# counts (In [7] ran before In [8]) that cell was executed first, so a fresh
# top-to-bottom run of this cell would not show the key. Consider moving this
# cell after step 3.
print(docs)

[Document(metadata={'producer': 'Adobe PDF Library 17.0', 'creator': 'Adobe InDesign 19.5 (Macintosh)', 'creationdate': '2024-10-03T09:29:06-04:00', 'author': 'JPMorgan Chase Bank', 'keywords': 'Chase; total; checking; guide to your account; ada; (PDF)', 'moddate': '2024-10-07T09:59:35-04:00', 'subject': 'Chase Total Checking - A Guide To Your Account', 'title': 'Chase Total Checking - A Guide To Your Account (PDF)', 'trapped': '/Unknown', 'source': 'data/chase_banking.pdf', 'total_pages': 4, 'page': 0, 'page_label': '1', 'chunk_index': 0}, page_content='HAVE QUESTIONS? CALL US AT 1-800-935-9935 (WE ACCEPT OPERATOR RELAY CALLS)  • WANT MORE INFO?  SEE THE DEPOSIT ACCOUNT AGREEMENT\n1\nCHASE TOTAL CHECKING\n®\nA GUIDE TO YOUR ACCOUNT †\nIt’s important that you understand how your Chase Total Checking account works. \nWe’ve created this Guide to explain the fees and some key terms of your personal account.\nMONTHLY \nSERVICE FEE*\nMonthly Service Fee $12\nWays to Avoid the \nMonthly Serv

In [7]:
# 3. Add metadata
# Tag every chunk with its position in `docs` so that a search hit can be traced
# back to the chunk it came from.
for i, doc in enumerate(docs):
    doc.metadata["chunk_index"] = i

# Spot-check the last chunk. Index `docs` explicitly rather than reusing the
# leaked loop variable: when `docs` is empty the loop never binds `doc`, so the
# old `print(doc)` raised NameError on a fresh kernel (or printed a stale `doc`
# left behind by the results loop in step 9).
if docs:
    print(docs[-1])
else:
    print("No chunks to annotate: `docs` is empty.")

page_content='JPMorgan Chase Bank, N.A. Member FDIC
© 2024 JPMorgan Chase & Co.
Effective 11/17/2024
FEETABLE-TOTAL-1124-ONLN' metadata={'producer': 'Adobe PDF Library 17.0', 'creator': 'Adobe InDesign 19.5 (Macintosh)', 'creationdate': '2024-10-03T09:29:06-04:00', 'author': 'JPMorgan Chase Bank', 'keywords': 'Chase; total; checking; guide to your account; ada; (PDF)', 'moddate': '2024-10-07T09:59:35-04:00', 'subject': 'Chase Total Checking - A Guide To Your Account', 'title': 'Chase Total Checking - A Guide To Your Account (PDF)', 'trapped': '/Unknown', 'source': 'data/chase_banking.pdf', 'total_pages': 4, 'page': 3, 'page_label': '4', 'chunk_index': 41}


In [10]:
import os
# 4. Embedding model
from dotenv import load_dotenv

# load_dotenv() reads a local .env file into os.environ, so the key does not
# need to be re-assigned by hand afterwards.
load_dotenv()

# Fail early with a readable message. The previous
# `os.environ["OPENAI_API_KEY"] = os.getenv("OPENAI_API_KEY")` was a no-op when
# the key existed and raised an opaque TypeError (os.environ only accepts str,
# not None) when it did not.
if not os.getenv("OPENAI_API_KEY"):
    raise RuntimeError(
        "OPENAI_API_KEY is not set; export it or add it to a .env file."
    )

embedding_model = OpenAIEmbeddings()

  embedding_model = OpenAIEmbeddings()


In [None]:
#5. Create FAISS vectorstore in memory (will be lost after execution)
# Builds the store from the chunk list `docs` using `embedding_model`.

faiss_db = FAISS.from_documents(docs,embedding_model)

In [None]:
#6. Save FAISS index (store permanently / persistently)
# Writes the index under the "faiss_index_dir" directory so it can be reloaded later.
faiss_db.save_local("faiss_index_dir")

In [None]:
#7. Reload FAISS index (loaded into memory)
# NOTE(review): allow_dangerous_deserialization=True opts in to loading
# serialized data that can execute code on load (presumably pickle — confirm
# against the FAISS.load_local docs). Acceptable here only because
# "faiss_index_dir" is the directory written by step 6 of this notebook; do not
# point this at an index obtained from an untrusted source.

loaded_db = FAISS.load_local("faiss_index_dir",embeddings=embedding_model,allow_dangerous_deserialization=True)

In [15]:
#8. Search
# Run a similarity search against the reloaded index and keep the k=3 hits it returns.
query="What is the main idea of the document?"
results = loaded_db.similarity_search(query,k=3)

In [16]:
# 9. Print the results
# For every hit show its 1-based rank, the first 300 characters of the chunk
# text, and the chunk's metadata dict.
for rank, hit in enumerate(results, start=1):
    preview = hit.page_content[:300]
    print(f"\n--- Result{rank} ---")
    print("Content:", preview, "...")
    print("Metadata:", hit.metadata)


--- Result1 ---
Content: exchange rates, refer to the 
Deposit Account Agreement
See the next page for other fees that may apply. ...
Metadata: {'producer': 'Adobe PDF Library 17.0', 'creator': 'Adobe InDesign 19.5 (Macintosh)', 'creationdate': '2024-10-03T09:29:06-04:00', 'author': 'JPMorgan Chase Bank', 'keywords': 'Chase; total; checking; guide to your account; ada; (PDF)', 'moddate': '2024-10-07T09:59:35-04:00', 'subject': 'Chase Total Checking - A Guide To Your Account', 'title': 'Chase Total Checking - A Guide To Your Account (PDF)', 'trapped': '/Unknown', 'source': 'data/chase_banking.pdf', 'total_pages': 4, 'page': 2, 'page_label': '3', 'chunk_index': 32}

--- Result2 ---
Content: JPMorgan Chase Bank, N.A. Member FDIC
© 2024 JPMorgan Chase & Co.
Effective 11/17/2024
FEETABLE-TOTAL-1124-ONLN ...
Metadata: {'producer': 'Adobe PDF Library 17.0', 'creator': 'Adobe InDesign 19.5 (Macintosh)', 'creationdate': '2024-10-03T09:29:06-04:00', 'author': 'JPMorgan Chase Bank', 'keywords': 