In [1]:

from langchain_ollama.llms import OllamaLLM
from langchain_community.document_loaders import PyPDFLoader, TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain_ollama.embeddings import OllamaEmbeddings
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate
import os

print("="*60)
print("Q&A OVER DOCUMENTS TUTORIAL")
print("="*60 + "\n")

  from .autonotebook import tqdm as notebook_tqdm
None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.


Q&A OVER DOCUMENTS TUTORIAL



In [2]:
print("PART 1: Creating Sample Documents\n")

# Create sample documents for testing
sample_text = """
Python is a high-level, interpreted programming language known for its simplicity and readability.
It was created by Guido van Rossum and first released in 1991.

Python is used in many fields:
- Web Development: Django, Flask, FastAPI
- Data Science: Pandas, NumPy, Scikit-learn
- Machine Learning: TensorFlow, PyTorch, Keras
- Automation: Scripts for system administration
- Scientific Computing: SciPy, Matplotlib

Key features of Python:
1. Easy to learn and read
2. Dynamically typed
3. Has a large standard library
4. Supports multiple programming paradigms (OOP, functional, procedural)
5. Cross-platform compatibility

Python's popularity has grown exponentially since 2010.
Major companies like Google, Netflix, Spotify, and Instagram use Python.

The Python community is very active and welcoming to beginners.
There are thousands of libraries available through PyPI (Python Package Index).

Python 3 is the current version, released in 2008.
Python 2 reached end-of-life in 2020.
"""

# Save to a text file
with open("sample_document.txt", "w") as f:
    f.write(sample_text)

print("✓ Created sample_document.txt\n")

print("="*60 + "\n")



PART 1: Creating Sample Documents

✓ Created sample_document.txt




In [4]:
print("PART 2: Document Loaders (Reading Documents)\n")

loader = TextLoader("sample_document.txt")
documents = loader.load()
print(f"Loaded {len(documents)} document(s)")
print(f"First document preview:")
print(documents[0].page_content[:200] + "...\n")

print("="*60 + "\n")

PART 2: Document Loaders (Reading Documents)

Loaded 1 document(s)
First document preview:

Python is a high-level, interpreted programming language known for its simplicity and readability.
It was created by Guido van Rossum and first released in 1991.

Python is used in many fields:
- Web...




In [5]:
print("PART 3: Text Splitting (Break into Chunks)\n")

# Split documents into smaller chunks
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,          # Size of each chunk
    chunk_overlap=100,       # Overlap between chunks (for context)
    separators=["\n\n", "\n", " ", ""]
)

chunks = text_splitter.split_documents(documents)

print(f"Split into {len(chunks)} chunks")
print(f"\nChunk 1 preview:")
print(chunks[0].page_content)
print(f"\nChunk 2 preview:")
print(chunks[1].page_content[:200] + "...\n")

print("="*60 + "\n")



PART 3: Text Splitting (Break into Chunks)

Split into 3 chunks

Chunk 1 preview:
Python is a high-level, interpreted programming language known for its simplicity and readability.
It was created by Guido van Rossum and first released in 1991.

Python is used in many fields:
- Web Development: Django, Flask, FastAPI
- Data Science: Pandas, NumPy, Scikit-learn
- Machine Learning: TensorFlow, PyTorch, Keras
- Automation: Scripts for system administration
- Scientific Computing: SciPy, Matplotlib

Chunk 2 preview:
Key features of Python:
1. Easy to learn and read
2. Dynamically typed
3. Has a large standard library
4. Supports multiple programming paradigms (OOP, functional, procedural)
5. Cross-platform compat...




In [6]:
print("PART 4: Embeddings (Convert Text to Vectors)\n")

# Create embeddings using Ollama
embeddings = OllamaEmbeddings(model="llama3")

# Test embedding a single piece of text
test_embedding = embeddings.embed_query("What is Python?")
print(f"Embedding created!")
print(f"Embedding dimension: {len(test_embedding)}")
print(f"First 5 values: {test_embedding[:5]}\n")

print("💡 Embeddings convert text into numbers (vectors)")
print("💡 Similar text = similar vectors")
print("💡 Used for semantic search\n")

print("="*60 + "\n")


PART 4: Embeddings (Convert Text to Vectors)

Embedding created!
Embedding dimension: 4096
First 5 values: [-0.009111718, -0.025767617, 0.022988915, 0.010610171, -0.01611066]

💡 Embeddings convert text into numbers (vectors)
💡 Similar text = similar vectors
💡 Used for semantic search




In [9]:
print("PART 5: Vector Store (Store & Search Embeddings)\n")

# Create vector store
vector_store = FAISS.from_documents(chunks, embeddings)

print("✓ Vector store created with FAISS")
print(f"✓ Stored {len(chunks)} chunks\n")

# Test semantic search
query = "What is Python used for?"
similar_docs = vector_store.similarity_search(query, k=2)

print(f"Query: '{query}'")
print(f"\nFound {len(similar_docs)} similar documents:\n")

for i, doc in enumerate(similar_docs, 1):
    print(f"Document {i}:")
    print(doc.page_content[:200] + "...\n")

print("="*60 + "\n")


PART 5: Vector Store (Store & Search Embeddings)

✓ Vector store created with FAISS
✓ Stored 3 chunks

Query: 'What is Python used for?'

Found 2 similar documents:

Document 1:
Key features of Python:
1. Easy to learn and read
2. Dynamically typed
3. Has a large standard library
4. Supports multiple programming paradigms (OOP, functional, procedural)
5. Cross-platform compat...

Document 2:
Python 3 is the current version, released in 2008.
Python 2 reached end-of-life in 2020....




In [8]:
!pip install faiss-cpu

Collecting faiss-cpu
  Downloading faiss_cpu-1.12.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (5.1 kB)
Downloading faiss_cpu-1.12.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (31.4 MB)
[2K   [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m31.4/31.4 MB[0m [31m809.9 kB/s[0m eta [36m0:00:00[0mm eta [36m0:00:01[0m[36m0:00:01[0m
[?25hInstalling collected packages: faiss-cpu
Successfully installed faiss-cpu-1.12.0


In [11]:

print("PART 6: Retrieval QA Chain (Ask Questions!)\n")

# Initialize LLM
llm = OllamaLLM(model="llama3", temperature=0.7)

# Create QA chain
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",  # Simple method: stuff all retrieved docs into prompt
    retriever=vector_store.as_retriever(search_kwargs={"k": 2}),
    verbose=True
)

# Ask questions
questions = [
    "What is Python?",
    "What companies use Python?",
    "When was Python created?",
    "What are the key features of Python?"
]

print("Asking questions about the document:\n")

for question in questions:
    print(f"\n{'='*60}")
    print(f"Question: {question}")
    print(f"{'='*60}")
    
    answer = qa_chain.run(question)
    print(f"Answer: {answer}\n")


print("="*60 + "\n")



PART 6: Retrieval QA Chain (Ask Questions!)

Asking questions about the document:


Question: What is Python?


[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m
Answer: Based on the provided context, a helpful answer to this question would be:

Python is a programming language that is known for its ease of learning and reading, dynamic typing, large standard library, support for multiple programming paradigms (OOP, functional, procedural), and cross-platform compatibility.


Question: What companies use Python?


[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m
Answer: According to the context, major companies like Google, Netflix, Spotify, and Instagram use Python.


Question: When was Python created?


[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m
Answer: I don't know. The context only mentions that Python 3 was released in 2008, but it doesn't provide information on when the original version of Python was created o

In [13]:
print("PART 7: Custom Prompts for QA\n")

# Create a custom prompt
custom_prompt_template = """Use the following pieces of context to answer the question at the end. 
If you don't know the answer, just say that you don't know, don't try to make up an answer.

Context:
{context}

Question: {question}

Answer in a helpful and concise way:"""

CUSTOM_PROMPT = PromptTemplate(
    template=custom_prompt_template,
    input_variables=["context", "question"]
)

# Create QA chain with custom prompt
from langchain.chains.question_answering import load_qa_chain

custom_qa_chain = load_qa_chain(
    llm=llm,
    chain_type="stuff",
    prompt=CUSTOM_PROMPT,
    verbose=False
)

# Test it
print("Using custom prompt:\n")

query = "List the main features of Python"
relevant_docs = vector_store.similarity_search(query, k=2)

result = custom_qa_chain.run(
    input_documents=relevant_docs,
    question=query
)

print(f"Question: {query}")
print(f"Answer: {result}\n")

print("="*60 + "\n")

PART 7: Custom Prompts for QA

Using custom prompt:



stuff: https://python.langchain.com/docs/versions/migrating_chains/stuff_docs_chain
map_reduce: https://python.langchain.com/docs/versions/migrating_chains/map_reduce_chain
refine: https://python.langchain.com/docs/versions/migrating_chains/refine_chain
map_rerank: https://python.langchain.com/docs/versions/migrating_chains/map_rerank_docs_chain

See also guides on retrieval and question-answering here: https://python.langchain.com/docs/how_to/#qa-with-rag
  custom_qa_chain = load_qa_chain(


Question: List the main features of Python
Answer: Based on the provided context, the main features of Python are:

1. Easy to learn and read
2. Dynamically typed
3. Has a large standard library
4. Supports multiple programming paradigms (OOP, functional, procedural)
5. Cross-platform compatibility

These features make Python a popular language for various applications, including web development, data science, machine learning, automation, and scientific computing.




In [14]:

print("PART 8: Different Chain Types\n")

print("""
Chain types for QA:

1. STUFF (Simple - Default)
   - Takes all retrieved docs and stuffs them into the prompt
   - Fast but limited by context window
   - Best for: Small documents, quick answers

2. MAP_REDUCE
   - Maps each document through LLM independently
   - Reduces results into final answer
   - Best for: Large documents, multiple documents

3. REFINE
   - Iteratively refines the answer by going through docs
   - Builds on previous answers
   - Best for: Deep analysis, detailed answers

4. MAP_RERANK
   - Maps documents and ranks them by score
   - Selects top documents
   - Best for: Precise answers from many documents
""")

# Example: MAP_REDUCE chain
print("Creating MAP_REDUCE chain...\n")

mapreduce_qa = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="map_reduce",
    retriever=vector_store.as_retriever(search_kwargs={"k": 3}),
    verbose=False
)

question = "Summarize what Python is and its uses"
print(f"Question: {question}")
answer = mapreduce_qa.run(question)
print(f"Answer: {answer}\n")

print("="*60 + "\n")


PART 8: Different Chain Types


Chain types for QA:

1. STUFF (Simple - Default)
   - Takes all retrieved docs and stuffs them into the prompt
   - Fast but limited by context window
   - Best for: Small documents, quick answers

2. MAP_REDUCE
   - Maps each document through LLM independently
   - Reduces results into final answer
   - Best for: Large documents, multiple documents

3. REFINE
   - Iteratively refines the answer by going through docs
   - Builds on previous answers
   - Best for: Deep analysis, detailed answers

4. MAP_RERANK
   - Maps documents and ranks them by score
   - Selects top documents
   - Best for: Precise answers from many documents

Creating MAP_REDUCE chain...

Question: Summarize what Python is and its uses
Answer: I don't know the answer. The provided portion of the document does not contain any information about what Python is and its uses, other than a partial sentence starting with "Python...". To provide a complete summary, more context would be need