In [1]:
# Standard library imports
import os
import sys
from pathlib import Path

# LangChain Document Loaders
from langchain_community.document_loaders import PyPDFLoader

# LangChain Text Splitters
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Ollama Integration
from langchain_ollama import OllamaEmbeddings, ChatOllama

# ChromaDB Vector Store
from langchain_chroma import Chroma

# LangChain Core Components
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser

# Report that every dependency imported cleanly and record the interpreter
# version so the run can be reproduced later.
print(
    "‚úì All imports successful!",
    "‚úì Ready for local offline RAG!",
    f"\nPython version: {sys.version}",
    sep="\n",
)

‚úì All imports successful!
‚úì Ready for local offline RAG!

Python version: 3.12.10 (main, Apr  9 2025, 04:06:22) [MSC v.1943 64 bit (AMD64)]


In [2]:
# List the models installed in the local Ollama instance. Later cells request
# "gemma3:1b" (chat model) and "nomic-embed-text:v1.5" (embedding model), so
# both names should appear in the output.
!ollama list

NAME                     ID              SIZE      MODIFIED    
gemma3:1b                8648f39daa8f    815 MB    3 weeks ago    
nomic-embed-text:v1.5    0a109f422b47    274 MB    3 weeks ago    


In [3]:
# Smoke test: send a single prompt to the local Ollama server before building
# the rest of the pipeline.
print("Testing Ollama connection...\n")

try:
    test_llm = ChatOllama(model="gemma3:1b", temperature=0)
    response = test_llm.invoke("Say 'Hello! I am running locally on your machine!'")

    # Reaching this point means the round trip to the model succeeded.
    print(
        "‚úì Ollama is working!",
        f"Response: {response.content}",
        "Ollama connection is successfull",
        sep="\n",
    )

except Exception as exc:
    # Any failure here is reported together with a hint on starting the server.
    print(
        f"‚úó Error connecting to Ollama: {exc}",
        "\nMake sure Ollama is running. Try: ollama serve",
        sep="\n",
    )

Testing Ollama connection...

‚úì Ollama is working!
Response: Hello! I am running locally on your machine! üòä

Ollama connection is successfull


In [4]:
# ===== CONFIGURATION: Update this path to your PDF file =====
pdf_path = "attention.pdf"  # Change this to your PDF file path
# =============================================================

# Fail fast when the PDF is missing. Merely printing a warning would leave
# `documents` undefined and make the following cells fail with an unrelated
# NameError.
if not os.path.isfile(pdf_path):
    raise FileNotFoundError(
        f"File '{pdf_path}' not found! "
        "Please update the pdf_path variable with your PDF file location."
    )

# Load the PDF; the loader returns one Document per page.
loader = PyPDFLoader(pdf_path)
documents = loader.load()

# The preview below indexes documents[0], so an empty result must be rejected
# explicitly instead of surfacing as an IndexError.
if not documents:
    raise ValueError(f"No pages could be loaded from '{pdf_path}'.")

# Display information
print(f"‚úì Loaded {len(documents)} pages from '{pdf_path}'")
print("\n--- First Page Preview ---")
print(f"Content (first 300 chars): {documents[0].page_content[:300]}...")
print(f"\nMetadata: {documents[0].metadata}")
print(f"\nTotal characters: {sum(len(doc.page_content) for doc in documents):,}")

‚úì Loaded 15 pages from 'attention.pdf'

--- First Page Preview ---
Content (first 300 chars): Provided proper attribution is provided, Google hereby grants permission to
reproduce the tables and figures in this paper solely for use in journalistic or
scholarly works.
Attention Is All You Need
Ashish Vaswani‚àó
Google Brain
avaswani@google.com
Noam Shazeer‚àó
Google Brain
noam@google.com
Niki Par...

Metadata: {'producer': 'pdfTeX-1.40.25', 'creator': 'LaTeX with hyperref', 'creationdate': '2024-04-10T21:11:43+00:00', 'author': '', 'keywords': '', 'moddate': '2024-04-10T21:11:43+00:00', 'ptex.fullbanner': 'This is pdfTeX, Version 3.141592653-2.6-1.40.25 (TeX Live 2023) kpathsea version 6.3.5', 'subject': '', 'title': '', 'trapped': '/False', 'source': 'attention.pdf', 'total_pages': 15, 'page': 0, 'page_label': '1'}

Total characters: 39,587


In [5]:
# Initialize text splitter
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1024,        # Maximum characters per chunk
    chunk_overlap=128,      # Characters shared by neighbouring chunks to keep context
    length_function=len,    # Chunk size is measured in characters
    separators=["\n\n", "\n", " ", ""]  # Split on paragraphs, then lines, etc.
)

# Split documents
chunks = text_splitter.split_documents(documents)

# If the loaded pages contain no text, no chunks are produced. Stop with a
# clear message instead of a ZeroDivisionError in the average below.
if not chunks:
    raise ValueError(
        "No text chunks were produced from the loaded documents; "
        "check that the PDF contains extractable text."
    )

# Display results
avg_chunk_size = sum(len(chunk.page_content) for chunk in chunks) / len(chunks)

print(f"‚úì Split {len(documents)} documents into {len(chunks)} chunks")
print(f"\nAverage chunk size: {avg_chunk_size:.0f} characters")

# Preview the first few chunks (numbered from 1 for readability)
print("\n--- Chunk Examples ---")
for chunk_number, chunk in enumerate(chunks[:3], start=1):
    print(f"\nChunk {chunk_number} (length: {len(chunk.page_content)} chars):")
    print(f"{chunk.page_content[:200]}...")

‚úì Split 15 documents into 49 chunks

Average chunk size: 873 characters

--- Chunk Examples ---

Chunk 1 (length: 986 chars):
Provided proper attribution is provided, Google hereby grants permission to
reproduce the tables and figures in this paper solely for use in journalistic or
scholarly works.
Attention Is All You Need
...

Chunk 2 (length: 944 chars):
based solely on attention mechanisms, dispensing with recurrence and convolutions
entirely. Experiments on two machine translation tasks show these models to
be superior in quality while being more pa...

Chunk 3 (length: 986 chars):
‚àóEqual contribution. Listing order is random. Jakob proposed replacing RNNs with self-attention and started
the effort to evaluate this idea. Ashish, with Illia, designed and implemented the first Tra...


In [7]:
# Embedding model served by the local Ollama instance
embeddings = OllamaEmbeddings(
    model="nomic-embed-text:v1.5",
    # base_url="http://localhost:11434"  # Default Ollama URL
)

# Embed one throwaway sentence to confirm the model responds and to discover
# the dimensionality of the vectors it returns.
print("Testing nomic-embed-text embeddings...\n")
sample_text = "This is a test sentence for embeddings."
sample_embedding = embeddings.embed_query(sample_text)

print(
    "‚úì Embeddings model: nomic-embed-text",
    f"‚úì Embedding dimension: {len(sample_embedding)}",
    f"‚úì Sample embedding (first 10 values): {sample_embedding[:10]}",
    f"\n‚ÑπÔ∏è  Each chunk will be converted to a {len(sample_embedding)}-dimensional vector",
    "‚ÑπÔ∏è  All processing happens locally on your machine!",
    sep="\n",
)

Testing nomic-embed-text embeddings...

‚úì Embeddings model: nomic-embed-text
‚úì Embedding dimension: 768
‚úì Sample embedding (first 10 values): [0.03250689, 0.06084158, -0.16616665, -0.08210022, 0.043314006, -0.025992092, 0.051577497, -0.015190071, -0.0082475105, -0.028356079]

‚ÑπÔ∏è  Each chunk will be converted to a 768-dimensional vector
‚ÑπÔ∏è  All processing happens locally on your machine!


In [8]:
# Create ChromaDB vector store
print(f"Creating ChromaDB vector store from {len(chunks)} chunks...")
print("This may take a minute...\n")

# Set persistent directory
persist_directory = "./chroma_db"

# Give every chunk a deterministic ID (source file, page, position in the
# chunk list). Without explicit IDs each run of this cell stores a brand-new
# copy of every chunk in the persisted collection, so after a re-run the
# retriever returns duplicates. With stable IDs a re-run overwrites the
# existing entries instead.
# NOTE(review): relies on langchain_chroma writing via upsert - confirm for the
# installed version. Chunks left over from a run with different splitter
# settings are not removed; delete the ./chroma_db directory to rebuild cleanly.
chunk_ids = [
    f"{chunk.metadata.get('source', 'unknown')}:p{chunk.metadata.get('page', 'na')}:c{position}"
    for position, chunk in enumerate(chunks)
]

# Create vector store (embeds every chunk with the local embedding model)
vectorstore = Chroma.from_documents(
    documents=chunks,
    embedding=embeddings,
    ids=chunk_ids,
    persist_directory=persist_directory,
    collection_name="local_rag_collection"
)

print(f"‚úì ChromaDB vector store created successfully!")
print(f"‚úì Indexed {len(chunks)} document chunks")
print(f"‚úì Stored at: {persist_directory}")
print(f"\n‚ÑπÔ∏è  Vector store persisted to disk - you can reload it later!")

Creating ChromaDB vector store from 49 chunks...
This may take a minute...

‚úì ChromaDB vector store created successfully!
‚úì Indexed 49 document chunks
‚úì Stored at: ./chroma_db

‚ÑπÔ∏è  Vector store persisted to disk - you can reload it later!


In [9]:
# Number of chunks handed to the LLM per question; defined once so the
# retriever configuration and the printed summary cannot drift apart.
top_k = 4

# Create retriever
# "similarity" means plain nearest-neighbour search. The distance metric is the
# one the Chroma collection was created with; no metric is configured in this
# notebook, so this is Chroma's default (L2 distance, not cosine) - verify
# against the Chroma docs if the metric matters for your use case.
retriever = vectorstore.as_retriever(
    search_type="similarity",
    search_kwargs={"k": top_k}    # Retrieve the top_k closest chunks
)

print("‚úì Retriever configured successfully")
print(f"  - Search type: similarity")
print(f"  - Number of documents to retrieve (k): {top_k}")

# Test retrieval
test_query = "What is the main topic of this document?"
print(f"\n--- Retriever Test ---")
print(f"Query: '{test_query}'")

retrieved_docs = retriever.invoke(test_query)

print(f"\nRetrieved {len(retrieved_docs)} documents:")
for doc_number, doc in enumerate(retrieved_docs, start=1):
    print(f"\nDocument {doc_number}:")
    print(f"  Content preview: {doc.page_content[:150]}...")
    print(f"  Source: Page {doc.metadata.get('page', 'N/A')}")

‚úì Retriever configured successfully
  - Search type: similarity
  - Number of documents to retrieve (k): 4

--- Retriever Test ---
Query: 'What is the main topic of this document?'

Retrieved 4 documents:

Document 1:
  Content preview: (section 5.4), learning rates and beam size on the Section 22 development set, all other parameters
remained unchanged from the English-to-German base...
  Source: Page 8

Document 2:
  Content preview: Input-Input Layer5
The
Law
will
never
be
perfect
,
but
its
application
should
be
just
-
this
is
what
we
are
missing
,
in
my
opinion
.
<EOS>
<pad>
The
...
  Source: Page 13

Document 3:
  Content preview: Table 1: Maximum path lengths, per-layer complexity and minimum number of sequential operations
for different layer types. n is the sequence length, d...
  Source: Page 5

Document 4:
  Content preview: Attention Visualizations
Input-Input Layer5
It
is
in
this
spirit
that
a
majority
of
American
governments
have
passed
new
laws
since
2009
making
the
re.

In [10]:
# Chat model used for answer generation, served by the local Ollama instance
llm = ChatOllama(
    model="gemma3:1b",
    temperature=0,          # Deterministic responses (0 = focused, 1 = creative)
    # num_predict=2000,     # Max tokens to generate
    # top_k=40,             # Top-k sampling
    # top_p=0.9,            # Top-p (nucleus) sampling
)

print(
    "‚úì LLM configured successfully",
    "  - Model: gemma3:1b (local)",
    "  - Temperature: 0 (deterministic)",
    sep="\n",
)

# One-off prompt to confirm the configured model answers
test_response = llm.invoke("Say 'Hello! I am Gemma running locally!'")
print(f"\nLLM Test Response: {test_response.content}")

‚úì LLM configured successfully
  - Model: gemma3:1b (local)
  - Temperature: 0 (deterministic)

LLM Test Response: Hello! I am Gemma running locally! üòä



In [11]:
# Prompt template: fixed instructions followed by the two placeholders that
# the chain fills in ({context} and {question}).
system_prompt = "\n\n".join(
    (
        " ".join(
            (
                "You are a helpful assistant for question-answering tasks.",
                "Use the following pieces of retrieved context to answer the question.",
                "If you don't know the answer based on the context, say that you don't know.",
                "Keep the answer concise and accurate.",
            )
        ),
        "Context: {context}",
        "Question: {question}",
    )
)

# NOTE(review): despite the variable name, from_template is documented to wrap
# the text in a single human message rather than a system message - confirm if
# the message role matters for the model in use.
prompt = ChatPromptTemplate.from_template(system_prompt)

# Helper used by the chain to turn retrieved documents into prompt context
def format_docs(docs):
    """Concatenate the text of the given documents.

    Args:
        docs: Iterable of objects exposing a ``page_content`` string.

    Returns:
        The ``page_content`` values joined with a blank line between them
        (an empty string when ``docs`` is empty).
    """
    page_texts = [doc.page_content for doc in docs]
    return "\n\n".join(page_texts)

# Assemble the RAG pipeline with LCEL. The incoming question is fanned out to
# two branches: one retrieves and formats the context, the other forwards the
# question unchanged. Both feed the prompt, then the LLM, then a string parser.
rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

print(
    "‚úì RAG chain created successfully using LCEL!",
    "\nRAG Pipeline Flow:",
    "  1. User provides a query",
    "  2. Retriever finds top 4 relevant chunks (local ChromaDB)",
    "  3. Chunks are formatted as context",
    "  4. Context + question formatted with prompt template",
    "  5. Local LLM (gemma3:1b) generates answer",
    "  6. Answer parsed and returned",
    "\nüîí Everything runs locally on your machine!",
    sep="\n",
)

‚úì RAG chain created successfully using LCEL!

RAG Pipeline Flow:
  1. User provides a query
  2. Retriever finds top 4 relevant chunks (local ChromaDB)
  3. Chunks are formatted as context
  4. Context + question formatted with prompt template
  5. Local LLM (gemma3:1b) generates answer
  6. Answer parsed and returned

üîí Everything runs locally on your machine!


In [12]:
# Example Query 1: a broad question about the document as a whole
query1 = "What is the main topic or contribution of this document?"

print(f"Query: {query1}", "\nProcessing locally...\n", sep="\n")

answer = rag_chain.invoke(query1)

print(
    "=" * 80,
    "ANSWER:",
    "=" * 80,
    answer,
    "\n" + "=" * 80,
    sep="\n",
)

# List the chunks behind the answer. This runs the retriever a second time
# with the same query; the chain itself does not expose the documents it
# retrieved.
print("\nSOURCE DOCUMENTS USED:", "=" * 80, sep="\n")
retrieved_docs = retriever.invoke(query1)
for doc_number, doc in enumerate(retrieved_docs, start=1):
    print(
        f"\nDocument {doc_number}:",
        f"  Page: {doc.metadata.get('page', 'N/A')}",
        f"  Content: {doc.page_content[:200]}...",
        "-" * 80,
        sep="\n",
    )

Query: What is the main topic or contribution of this document?

Processing locally...

ANSWER:
The document discusses the parameters used during the development of the Section 22 development set, specifically focusing on learning rates and beam size.


SOURCE DOCUMENTS USED:

Document 1:
  Page: 8
  Content: (section 5.4), learning rates and beam size on the Section 22 development set, all other parameters
remained unchanged from the English-to-German base translation model. During inference, we
9...
--------------------------------------------------------------------------------

Document 2:
  Page: 12
  Content: Attention Visualizations
Input-Input Layer5
It
is
in
this
spirit
that
a
majority
of
American
governments
have
passed
new
laws
since
2009
making
the
registration
or
voting
process
more
difficult
.
<EOS...
--------------------------------------------------------------------------------

Document 3:
  Page: 13
  Content: Input-Input Layer5
The
Law
will
never
be
perfect
,
but
it

In [13]:
# Example Query 2: ask the chain to extract and summarise specific information
query2 = "Can you summarize the key technical contributions or innovations mentioned?"

print(f"Query: {query2}", "\nProcessing locally...\n", sep="\n")

answer = rag_chain.invoke(query2)

print(
    "=" * 80,
    "ANSWER:",
    "=" * 80,
    answer,
    "\n" + "=" * 80,
    sep="\n",
)

Query: Can you summarize the key technical contributions or innovations mentioned?

Processing locally...

ANSWER:
Here‚Äôs a summary of the key technical contributions:

*   **Transformer Model Development:** Ashish, with Illia, designed and implemented the first Transformer models.
*   **Attention Mechanism:** Noam proposed scaled dot-product attention and parameter-free position representation.
*   **Tensor2Tensor:** Niki designed, implemented, tuned, and evaluated numerous model variants.
*   **Position Representation:** Llion experimented with novel model variants.
*   **Training Techniques:** The team used techniques like beam search and parameter averaging.
*   **Evaluation:** The team estimated the number of floating-point operations used for training models.



In [None]:
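# # Optional: uncomment this cell to compare embedding models.
# # Prerequisite: the "embeddinggemma" model is not among the locally installed
# # models listed by `ollama list` earlier in this notebook, so pull it first
# # (e.g. `ollama pull embeddinggemma`) before running the comparison.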

# print("Comparing embedding models...\n")

# test_query = "What is attention mechanism?"

# # Test with nomic-embed-text
# print("=" * 80)
# print("Using nomic-embed-text:")
# print("=" * 80)
# embeddings_nomic = OllamaEmbeddings(model="nomic-embed-text:v1.5")
# vectorstore_nomic = Chroma.from_documents(
#     documents=chunks[:10],  # Use first 10 chunks for quick test
#     embedding=embeddings_nomic,
#     collection_name="test_nomic"
# )
# retriever_nomic = vectorstore_nomic.as_retriever(search_kwargs={"k": 2})
# docs_nomic = retriever_nomic.invoke(test_query)

# print(f"\nTop retrieved document:")
# print(f"{docs_nomic[0].page_content[:200]}...\n")

# # Test with embeddinggemma
# print("=" * 80)
# print("Using embeddinggemma:")
# print("=" * 80)
# embeddings_gemma = OllamaEmbeddings(model="embeddinggemma:latest")
# vectorstore_gemma = Chroma.from_documents(
#     documents=chunks[:10],
#     embedding=embeddings_gemma,
#     collection_name="test_gemma"
# )
# retriever_gemma = vectorstore_gemma.as_retriever(search_kwargs={"k": 2})
# docs_gemma = retriever_gemma.invoke(test_query)

# print(f"\nTop retrieved document:")
# print(f"{docs_gemma[0].page_content[:200]}...\n")

# print("=" * 80)
# print("\n‚ÑπÔ∏è  Both models perform well. Choose based on your preference!")
# print("   - nomic-embed-text: Smaller (274MB), general-purpose")
# print("   - embeddinggemma: Larger (621MB), optimized for Gemma models")