In [2]:
import os
from dotenv import load_dotenv
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import Chroma

# ------------------------
# Load API keys
# ------------------------
# Read variables from a local .env file (if one exists) into the process
# environment so the OpenAI client can pick up OPENAI_API_KEY.
load_dotenv()

# Fail fast with an actionable message when the key is missing. Re-assigning
# os.environ["OPENAI_API_KEY"] from os.getenv() is a no-op when the key is set
# and raises an opaque TypeError when it is not (os.environ rejects None).
if not os.getenv("OPENAI_API_KEY"):
    raise RuntimeError(
        "OPENAI_API_KEY is not set; add it to your .env file or export it "
        "in the environment before running this script."
    )


# ------------------------
# Step 1: Load PDF
# ------------------------
# The path is relative, so it is resolved against the current working directory.
pdf_path = "data/attention-is-all-you-need-Paper.pdf"
loader = PyPDFLoader(pdf_path)
docs = loader.load()

# ------------------------
# Step 2: Split text into chunks
# ------------------------
# Chunks of at most 800 characters; consecutive chunks share 100 characters so
# that text cut at a boundary still appears intact in one of them.
splitter = RecursiveCharacterTextSplitter(chunk_size=800, chunk_overlap=100)
chunks = splitter.split_documents(docs)

# ------------------------
# Step 3: Embeddings model
# ------------------------
embeddings = OpenAIEmbeddings(model="text-embedding-3-large")

# ------------------------
# Step 4: Create Chroma DB & Store
# ------------------------
persist_directory = "chroma_store"  # folder to save Chroma DB files

# NOTE(review): re-running this cell against an existing persist_directory
# presumably adds the same chunks again -- confirm, and clear the folder or
# load the existing store instead if duplicates matter.
vector_store = Chroma.from_documents(
    documents=chunks,
    embedding=embeddings,
    persist_directory=persist_directory,
)

# No explicit vector_store.persist() call: since Chroma 0.4.x documents are
# written to persist_directory automatically, and the manual method is
# deprecated (calling it only emits a deprecation warning).
print(f" Stored {len(chunks)} chunks in Chroma DB at '{persist_directory}'.")

# ------------------------
# Step 5: Similarity Search
# ------------------------
# Ask the store for the three chunks that best match a sample question.
query = "What is the main topic of the document?"
results = vector_store.similarity_search(query, k=3)

# Print a numbered preview (first 200 characters) of each returned chunk.
print("\n Search Results:")
for i, res in enumerate(results, start=1):
    preview = res.page_content[:200]
    print(f"{i}. {preview}...\n")


  vector_store.persist()


 Stored 51 chunks in Chroma DB at 'chroma_store'.

 Search Results:
1. results to the base model.
7 Conclusion
In this work, we presented the Transformer, the ﬁrst sequence transduction model based entirely on
attention, replacing the recurrent layers most commonly used ...

2. textual entailment and learning task-independent sentence representations [4, 22, 23, 19].
End-to-end memory networks are based on a recurrent attention mechanism instead of sequence-
aligned recurren...

3. mechanism. We propose a new simple network architecture, the Transformer,
based solely on attention mechanisms, dispensing with recurrence and convolutions
entirely. Experiments on two machine transla...

