In [14]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyPDFLoader, DirectoryLoader
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS


In [15]:
# Directory scanned for the source PDF files.
DATA_PATH = "apple_products_data"
# Directory the FAISS vector index is saved to.
DB_FAISS_PATH = "vector_db"

In [16]:
# Load every file in DATA_PATH matching '*.pdf', parsing each with PyPDFLoader.
loader = DirectoryLoader(DATA_PATH, glob='*.pdf', loader_cls=PyPDFLoader)
documents = loader.load()

# Fail fast with a clear message: an empty list would otherwise only surface
# later as an obscure error when the vector index is built.
if not documents:
    raise ValueError(f"No PDF content was loaded from {DATA_PATH!r}; nothing to index.")

# NOTE: this counts the Document objects returned by the loader. PyPDFLoader
# returns one Document per PDF page, so the number is pages rather than files.
print(f" Loaded {len(documents)} documents from {DATA_PATH}")

 Loaded 41 documents from apple_products_data


In [17]:
# Split the loaded documents into overlapping chunks for embedding:
# chunk_size=300 caps each chunk's length and chunk_overlap=100 is the amount
# shared between neighbouring chunks (measured in characters with the
# splitter's default length function).
text_splitter = RecursiveCharacterTextSplitter(chunk_size=300, chunk_overlap=100)
texts = text_splitter.split_documents(documents)

# Guard before indexing texts[0]: documents with no extractable text (e.g.
# scanned PDFs) produce zero chunks, which would raise a bare IndexError below.
if not texts:
    raise ValueError(
        "Text splitting produced no chunks; check that the PDFs contain extractable text."
    )

print(f" Created {len(texts)} text chunks.")
print("First chunk:", texts[0].page_content[:100])

 Created 504 text chunks.
First chunk: File: all_data_doc.txt Page 1 of 41
iPhone 16 Pro – Detailed Specifications
General Information:
- M


In [18]:
# Embedding model used for both indexing and querying; explicitly run on CPU.
embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2",
    model_kwargs={"device": "cpu"},
)

# Smoke test: embed a single sentence and inspect the size and leading values
# of the resulting vector.
sample_text = "this is a text sentence."
vector = embeddings.embed_query(sample_text)

print(f" embedding vector size: {len(vector)}")
print(f" first 5 values: {vector[:5]}")

 embedding vector size: 384
 first 5 values: [0.06502405554056168, 0.11340225487947464, 0.021272556856274605, 0.0413065105676651, 0.02247108519077301]


In [19]:
# Embed every chunk and build an in-memory FAISS index over the vectors.
db = FAISS.from_documents(documents=texts, embedding=embeddings)

# Sanity check: fetch the three chunks closest to a sample query and show the
# start of the best match.
query = " Iphone 16 "
results = db.similarity_search(query, k=3)

print(f" found {len(results)} similar documents. ")
print(' first results: ', results[0].page_content[:200])

 found 3 similar documents. 
 first results:  General Information:
- Model Name: iPhone 16
- Release Date: September 2024
- Price (India): ₹79,900 for 128GB variant (source: apple.com/in)
- Operating System: iOS 18
Design & Build:
- Material: Alu


In [20]:
# Persist the FAISS index to DB_FAISS_PATH so it can be reloaded without re-embedding.
db.save_local(folder_path=DB_FAISS_PATH)