In [1]:
# Import necessary libraries for document loading
import os  # For working with files and directories
import docx2txt  # For converting Word documents to text
from langchain_community.document_loaders import PyPDFLoader  # For loading text documents
from langchain_text_splitters import RecursiveCharacterTextSplitter  # For splitting text into chunks
from langchain_core.documents import Document  # For creating document objects

In [None]:
file_paths=[r"C:\Users\Admin\Desktop\Strands\data\AWS Certified Cloud Practitioner Slides v41.pdf",r"C:\Users\Admin\Desktop\Strands\data\AWS_Certified AI Practitioner_ Slides.pdf"]

for file in file_paths:

    loader = PyPDFLoader(file)

    docs = loader.load()

    print(len(docs))

In [7]:
# Split documents into chunks
# We split large documents into smaller chunks to make search more effective
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,  # Each chunk will be about 500 characters
    chunk_overlap=100  # Chunks will overlap by 100 characters to maintain context
)

# Apply the splitter to our documents
splits = text_splitter.split_documents(docs)
print(f"Created {len(splits)} document chunks")

Created 660 document chunks


In [13]:
import os
from langchain_openai import OpenAIEmbeddings  # LangChain’s OpenAI embeddings

# Make sure your OpenAI API key is set
# Either via environment variable, or pass it explicitly
os.environ["OPENAI_API_KEY"] = os.getenv("OPENAI_API_KEY")

# Create the embedding model object using OpenAI
embedding_model = OpenAIEmbeddings(
    model="text-embedding-3-large",  # or choose another embedding model
    # dimensions=1024,  # optional: specify embedding size for some models
)


In [16]:
from langchain_community.vectorstores.faiss import FAISS

# Suppose `documents` is a list of LangChain Document objects or plain strings
vectorstore = FAISS.from_documents(splits, embedding_model)

# To persist (optional)
persist_directory = "vectordb_openai/"
vectorstore.save_local(persist_directory)

# To load later
# loaded_vs = FAISS.load_local(persist_directory, embedding_model)


In [17]:
# Test a simple query
query = "What aws sagemaker"  # Our test question
results = vectorstore.similarity_search(query, k=3)  # Get the top 3 most relevant results

print("Query:", query)
print("\nTop 3 results:")
for i, doc in enumerate(results):
    print(f"\nResult {i+1}:")
    print(f"Source: {doc.metadata['source']}")  # Which restaurant this is from
    print(f"Content: {doc.page_content[:200]}...")  # Show the first 200 characters

Query: What aws sagemaker

Top 3 results:

Result 1:
Source: C:\Users\Admin\Desktop\Strands\data\AWS_Certified AI Practitioner_ Slides.pdf
Content: © Stephane Maarek
NOT FOR DISTRIBUTION © Stephane Maarek www.datacumulus.com 
Amazon SageMaker...

Result 2:
Source: C:\Users\Admin\Desktop\Strands\data\AWS_Certified AI Practitioner_ Slides.pdf
Content: © Stephane Maarek
NOT FOR DISTRIBUTION © Stephane Maarek www.datacumulus.com 
Amazon SageMaker AI•Fully managed service for developers / data scientists to build ML models•Typically, difficult to do a...

Result 3:
Source: C:\Users\Admin\Desktop\Strands\data\AWS_Certified AI Practitioner_ Slides.pdf
Content: https://aws.amazon.com/blogs/machine-learning/running-principal-component-analysis-in-amazon-sagemaker/...
