In [7]:
!pip uninstall numpy -y
!pip install "numpy<2.0"

Found existing installation: numpy 2.3.5
Uninstalling numpy-2.3.5:
  Successfully uninstalled numpy-2.3.5
Collecting numpy<2.0
  Downloading numpy-1.26.4-cp312-cp312-macosx_11_0_arm64.whl.metadata (61 kB)
Downloading numpy-1.26.4-cp312-cp312-macosx_11_0_arm64.whl (13.7 MB)
[2K   [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.7/13.7 MB[0m [31m7.1 MB/s[0m  [33m0:00:01[0m7.2 MB/s[0m eta [36m0:00:01[0m:01[0m
[?25hInstalling collected packages: numpy
Successfully installed numpy-1.26.4


In [5]:
!pip install chromadb==0.4.22



In [1]:
# Smoke test: confirm that the chromadb package is importable in this kernel
# before the rest of the notebook relies on it.
import chromadb
print("✓ ChromaDB imported successfully!")

✓ ChromaDB imported successfully!


# Demo Notebook: Embeddings & Vector Search (Task 2)
This notebook demonstrates dataset loading, chunking, embedding, storing, and similarity search.

In [2]:
import sys
import re
import numpy as np
from sentence_transformers import SentenceTransformer, util
import faiss

## Step 1: Load Dataset

In [3]:
# The demo corpus is an inline string literal, so nothing in this cell can fail
# at runtime; it is therefore not wrapped in an error handler.
# The text is kept byte-for-byte as authored because the chunking, embedding and
# retrieval results further down are computed from it.
dataset_text = """
Retrieval-Augmented Generation (RAG) significantly improves the accuracy and reliability of language models by grounding their answers in external knowledge sources.
Traditional language models often hallucinate or confidently produce incorrect information because they cannot verify facts or access new knowledge.
RAG solves this by connecting retrieval systems with generative models.
First, relevant documents are retrieved using similarity search techniques.
Then, the retrieved text is inserted into the prompt, allowing the model to generate answers based on real context instead of guessing.
This approach is used in systems like support chatbots, academic research assistants, knowledge search tools, and customer service AI.
It enables models to answer domain-specific questions such as university procedures, medical guidelines, or technical documentation.
With embeddings and vector search, we can find semantically similar text even if exact wording differs.
Therefore, chunking, embedding, storage, and cosine similarity are essential building blocks for a working RAG pipeline.

When writing queries for RAG systems, it is important to:
- Be clear and concise
- Use domain-specific keywords
- Include context when possible
- Avoid vague pronouns
Effective query design improves retrieval quality and reduces the chance of irrelevant results.

The pipeline often involve:
1. Preprocessing datasets
2. Chunking text into meaningful pieces
3. Creating embeddings for each chunk
4. Storing embeddings in a vector database
5. Computing similarity between query and chunks
6. Retrieving top-K chunks for LLM input

RAG reduces hallucination by grounding LLM responses in retrieved context.
Using retrieval-augmented generation, language models are less likely to hallucinate because they base answers on real text.
"""

print("Step 1: Dataset loaded successfully.\n")


Step 1: Dataset loaded successfully.



## Step 2: Chunk the text

In [4]:
def chunk_text(text, sentences_per_chunk=2):
    """Split ``text`` into chunks of roughly ``sentences_per_chunk`` sentences.

    Rules:
      * Blank lines are ignored; every other line is stripped.
      * A line that starts with ``<digits>.`` followed by whitespace (a numbered
        list item) always becomes a chunk of its own, and closes whatever text
        was pending before it.
      * Every other line is split into pieces after ``.``, ``!`` or ``?``
        followed by spaces. Pieces are accumulated, joined with a single space,
        until ``sentences_per_chunk`` of them end in sentence punctuation.
        Pieces without terminal punctuation (headings ending in ``:``, bullet
        lines) are carried along and do not count towards the limit.
      * Any text still pending at the end becomes the final chunk.

    Args:
        text: The raw document text.
        sentences_per_chunk: Number of completed sentences that closes a chunk.

    Returns:
        A list of chunk strings, in document order.
    """
    list_item = re.compile(r'^\d+\.(\s|$)')
    # Terminal punctuation, optionally followed by closing quotes/brackets.
    sentence_end = re.compile(r'[.!?]["\')\]]*$')

    result = []
    pending = []      # pieces of the chunk currently being assembled
    completed = 0     # how many of the pending pieces are finished sentences

    def flush():
        nonlocal completed
        if pending:
            result.append(" ".join(pending).strip())
            pending.clear()
        completed = 0

    for raw_line in text.splitlines():
        line = raw_line.strip()
        if not line:
            continue

        if list_item.match(line):
            # Numbered items stay intact and never merge with neighbours.
            flush()
            result.append(line)
            continue

        for piece in re.split(r'(?<=[.!?]) +', line):
            pending.append(piece)
            # Count finished sentences rather than '.' characters, so that
            # '!' / '?' close a sentence and internal periods do not.
            if sentence_end.search(piece):
                completed += 1
            if completed >= sentences_per_chunk:
                flush()

    flush()  # leftover text that never reached the sentence limit
    return result


try:
    chunks = chunk_text(dataset_text)

    print(f"Step 2: Created {len(chunks)} meaningful chunks.\n")
    for i, c in enumerate(chunks):
        print(f"Chunk {i+1}:\n{c}\n")

except Exception as e:
    print("Step 2 failed:", e)
    sys.exit(1)

Step 2: Created 13 meaningful chunks.

Chunk 1:
Retrieval-Augmented Generation (RAG) significantly improves the accuracy and reliability of language models by grounding their answers in external knowledge sources. Traditional language models often hallucinate or confidently produce incorrect information because they cannot verify facts or access new knowledge.

Chunk 2:
RAG solves this by connecting retrieval systems with generative models. First, relevant documents are retrieved using similarity search techniques.

Chunk 3:
Then, the retrieved text is inserted into the prompt, allowing the model to generate answers based on real context instead of guessing. This approach is used in systems like support chatbots, academic research assistants, knowledge search tools, and customer service AI.

Chunk 4:
It enables models to answer domain-specific questions such as university procedures, medical guidelines, or technical documentation. With embeddings and vector search, we can find semantic

## Step 3: Generate embeddings (requires sentence-transformers package)

In [5]:
try:
    # Load the sentence-embedding model used for the chunks here and for the
    # query later on.
    model = SentenceTransformer('all-MiniLM-L6-v2')
    print("Step 3: Sentence Transformer model loaded.\n")

    # One embedding per chunk; vectors are normalised at encode time.
    chunk_embeddings = model.encode(
        chunks,
        normalize_embeddings=True,
        convert_to_numpy=True,
    )
    print(f"Step 3: Embeddings generated for {len(chunks)} chunks.\n")
except Exception as err:
    print("Step 3 failed:", err)
    sys.exit(1)

Step 3: Sentence Transformer model loaded.

Step 3: Embeddings generated for 13 chunks.



## Step 4: Build the FAISS index

In [6]:
try:
    # The index is sized from the width of the embedding matrix. It is an
    # inner-product index; the chunk embeddings were normalised when encoded,
    # so inner product is used here as the cosine-similarity score.
    dimension = chunk_embeddings.shape[1]
    index = faiss.IndexFlatIP(dimension)
    index.add(chunk_embeddings)
    print(f"Step 4: FAISS index created and {len(chunks)} embeddings added.\n")
except Exception as err:
    print("Step 4 failed:", err)
    sys.exit(1)

Step 4: FAISS index created and 13 embeddings added.



## Step 5: Query

In [7]:
try:
    query = "How does RAG reduce hallucination?"

    # Encode the query with the same model and normalisation as the chunks so
    # the two sets of vectors are directly comparable.
    query_embedding = model.encode(
        [query],
        normalize_embeddings=True,
        convert_to_numpy=True,
    )
    print(f"Step 5: Query transformed into embedding: '{query}'\n")
except Exception as err:
    print("Step 5 failed:", err)
    sys.exit(1)


Step 5: Query transformed into embedding: 'How does RAG reduce hallucination?'



## Step 6: Retrieve text

In [8]:
try:
    top_k = 5
    threshold = 0.1  # Minimum cosine similarity

    # D holds the similarity scores and I the matching row numbers in the
    # index, one row per query (there is a single query here).
    D, I = index.search(query_embedding, top_k)

    # FAISS pads the result with label -1 when the index holds fewer than
    # top_k vectors. Without the explicit check, chunks[-1] would silently
    # return the last chunk for such a padding slot.
    retrieved_chunks = [
        (score, chunks[idx])
        for score, idx in zip(D[0], I[0])
        if idx >= 0 and score >= threshold
    ]

    if not retrieved_chunks:
        print(f"Query: {query}\n\nNo relevant chunks found for this query.\n")
    else:
        print(f"Query: {query}\n\nRetrieved Top-{len(retrieved_chunks)} Chunks:\n")
        for i, (score, chunk) in enumerate(retrieved_chunks):
            print(f"Result {i+1} (score={score:.3f}):\n{chunk}\n")

except Exception as e:
    print("Step 6 failed:", e)
    sys.exit(1)

Query: How does RAG reduce hallucination?

Retrieved Top-4 Chunks:

Result 1 (score=0.529):
RAG reduces hallucination by grounding LLM responses in retrieved context. Using retrieval-augmented generation, language models are less likely to hallucinate because they base answers on real text.

Result 2 (score=0.264):
Therefore, chunking, embedding, storage, and cosine similarity are essential building blocks for a working RAG pipeline. When writing queries for RAG systems, it is important to: - Be clear and concise - Use domain-specific keywords - Include context when possible - Avoid vague pronouns Effective query design improves retrieval quality and reduces the chance of irrelevant results.

Result 3 (score=0.224):
Retrieval-Augmented Generation (RAG) significantly improves the accuracy and reliability of language models by grounding their answers in external knowledge sources. Traditional language models often hallucinate or confidently produce incorrect information because they cann

In [9]:
import chromadb
from sentence_transformers import SentenceTransformer

class VectorDB:
    """Minimal vector store: SentenceTransformer embeddings in a ChromaDB collection.

    The collection is created with ``hnsw:space`` set to ``cosine``, so the
    distances returned by queries are cosine distances and ``1 - distance`` is
    reported as the similarity score.
    """

    def __init__(self, collection_name="rag_documents", model_name="all-MiniLM-L6-v2"):
        """Create a fresh collection and load the embedding model.

        Args:
            collection_name: Name of the ChromaDB collection. An existing
                collection with this name on the client is deleted first.
            model_name: SentenceTransformer model used for documents and queries.
        """
        self.client = chromadb.Client()

        # Best-effort reset for clean testing: the delete is expected to fail
        # when the collection does not exist yet. Catch Exception rather than
        # using a bare except so KeyboardInterrupt/SystemExit still propagate.
        try:
            self.client.delete_collection(name=collection_name)
        except Exception:
            pass

        self.collection = self.client.create_collection(
            name=collection_name,
            metadata={"hnsw:space": "cosine"}
        )
        self.model_name = model_name
        self.model = SentenceTransformer(model_name)
        print(f"✓ VectorDB initialized with ChromaDB collection: {collection_name}")

    def add_docs(self, chunks, metadata=None):
        """Embed ``chunks`` and append them to the collection.

        Args:
            chunks: Iterable of text chunks.
            metadata: Optional list with one metadata dict per chunk. When
                omitted or empty, ``{'source': 'unknown', 'chunk_id': n}`` is
                generated for each chunk.

        Returns:
            True when documents were added, False when ``chunks`` was empty.

        Raises:
            ValueError: If ``metadata`` is given but its length differs from
                the number of chunks.
        """
        chunks = list(chunks)
        if not chunks:
            print("✓ No documents to add to ChromaDB")
            return False

        # Continue numbering after what is already stored so that a second
        # call does not reuse the ids chunk_0, chunk_1, ... of the first one.
        offset = self.collection.count()

        if not metadata:
            metadata = [
                {'source': 'unknown', 'chunk_id': offset + i}
                for i in range(len(chunks))
            ]
        elif len(metadata) != len(chunks):
            raise ValueError(
                f"metadata has {len(metadata)} entries but {len(chunks)} chunks were given"
            )

        # Generate embeddings
        embeddings = self.model.encode(chunks, convert_to_numpy=True)
        ids = [f"chunk_{offset + i}" for i in range(len(chunks))]

        self.collection.add(
            embeddings=embeddings.tolist(),
            documents=chunks,
            metadatas=metadata,
            ids=ids
        )

        print(f"✓ Added {len(chunks)} documents to ChromaDB")
        return True

    def search(self, query, top_k=5):
        """Return up to ``top_k`` stored chunks most similar to ``query``.

        Args:
            query: Query text.
            top_k: Maximum number of results.

        Returns:
            A list of dicts with keys ``'chunk'``, ``'score'`` (1 - distance)
            and ``'metadata'``, in the order returned by the collection.
            Empty when the collection holds no documents.
        """
        total = self.collection.count()
        if total == 0:
            return []

        query_embedding = self.model.encode([query], convert_to_numpy=True)

        results = self.collection.query(
            query_embeddings=query_embedding.tolist(),
            # Never ask for more neighbours than there are documents.
            n_results=min(top_k, total)
        )

        # Results are nested per query; a single query is sent, so use row 0.
        documents = results['documents'][0]
        distances = results['distances'][0]
        metadatas = results['metadatas'][0]

        return [
            {'chunk': doc, 'score': 1 - dist, 'metadata': meta}
            for doc, dist, meta in zip(documents, distances, metadatas)
        ]

    def get_stats(self):
        """Return the document count, embedding model name and backend name."""
        return {
            'total_chunks': self.collection.count(),
            'model': self.model_name,
            'database': 'ChromaDB'
        }

In [12]:
# End-to-end check of VectorDB: index the chunks produced earlier in the
# notebook, run one query and show the best match.
db = VectorDB()

# One metadata record per chunk; 'page' is simply the chunk's position.
page_metadata = [{'source': 'test.txt', 'page': position} for position in range(len(chunks))]
db.add_docs(chunks, metadata=page_metadata)

results = db.search("How does RAG work?")
print(results[0])

✓ VectorDB initialized with ChromaDB collection: rag_documents
✓ Added 13 documents to ChromaDB
{'chunk': 'Therefore, chunking, embedding, storage, and cosine similarity are essential building blocks for a working RAG pipeline. When writing queries for RAG systems, it is important to: - Be clear and concise - Use domain-specific keywords - Include context when possible - Avoid vague pronouns Effective query design improves retrieval quality and reduces the chance of irrelevant results.', 'score': 0.44305121898651123, 'metadata': {'page': 4, 'source': 'test.txt'}}
