In [2]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyPDFLoader, DirectoryLoader
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS


In [3]:
# Notebook configuration: input directory holding the source PDFs, and the
# output directory the FAISS vector store is written to.
DATA_PATH = 'apple_product_texts'
DB_FAISS_PATH = 'vector_db'

In [5]:
# Load every file matching '*.pdf' in DATA_PATH, parsing each with PyPDFLoader.
loader = DirectoryLoader(DATA_PATH, glob='*.pdf', loader_cls=PyPDFLoader)
documents = loader.load()

# Fail fast with a clear message: an empty result would otherwise only surface
# in later cells as an opaque IndexError (e.g. when indexing the first chunk).
if not documents:
    raise FileNotFoundError(
        f"No PDF content could be loaded from {DATA_PATH!r} (glob='*.pdf'); "
        "check that the directory contains readable PDF files."
    )

# The count is the number of Document objects returned by loader.load().
print(f" Loaded {len(documents)} documents from {DATA_PATH}")

  from cryptography.hazmat.primitives.ciphers.algorithms import AES, ARC4


 Loaded 392 documents from apple_product_texts


In [6]:
# Split the loaded documents into overlapping chunks
# (chunk_size=500, chunk_overlap=50).
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=50,
)
texts = text_splitter.split_documents(documents)

# Report how many chunks were produced and preview the first 200 characters
# of the first chunk as a quick sanity check.
print(f" Created {len(texts)} text chunks.")
print("First chunk:", texts[0].page_content[:200])

 Created 1621 text chunks.
First chunk: MacBook Air (15-inch, M3, 2024) - Tech Specs
StoreShopShop the LatestMaciPadiPhoneApple WatchAccessoriesQuick LinksFind a StoreOrder
StatusWays to BuyPersonal SetupShop Special StoresEducationBusiness


In [7]:
# Embedding model: all-MiniLM-L6-v2 from sentence-transformers, with the
# device explicitly set to CPU via model_kwargs.
embeddings = HuggingFaceEmbeddings(
    model_name='sentence-transformers/all-MiniLM-L6-v2',
    model_kwargs={'device': 'cpu'},
)

# Smoke test: embed a single sentence and inspect the resulting vector.
sample_text = "this is a text sentence."
vector = embeddings.embed_query(sample_text)

print(f" embedding vector size: {len(vector)}")
print(f" first 5 values: {vector[:5]}")

  from .autonotebook import tqdm as notebook_tqdm


 embedding vector size: 384
 first 5 values: [0.06502405554056168, 0.11340225487947464, 0.021272556856274605, 0.0413065105676651, 0.02247108519077301]


In [9]:
# Build the FAISS vector store by embedding every chunk in `texts`.
db = FAISS.from_documents(texts, embeddings)

# Sanity-check retrieval: fetch the 3 chunks most similar to a sample query.
query = " Iphone 16 "
results = db.similarity_search(query, k=3)

# Show how many hits came back and preview the first 200 characters of the
# top hit.
print(f" found {len(results)} similar documents. ")
print(' first results: ', results[0].page_content[:200])

 found 3 similar documents. 
 first results:  iPhone 16 Plus - Tech Specs
StoreShopShop the LatestMaciPadiPhoneApple WatchAccessoriesQuick LinksFind a StoreOrder
StatusWays to BuyPersonal SetupShop Special StoresEducationBusinessMacExplore MacExp


In [10]:
# Write the vector store to disk under DB_FAISS_PATH.
db.save_local(folder_path=DB_FAISS_PATH)