### FAISS Vector Storage

In [3]:
import faiss
import numpy as np
from sentence_transformers import SentenceTransformer

# Print the FAISS version exposed by the imported module, then a confirmation
# line, so the recorded output shows which build the cells below ran against.
print(f"FAISS version: {faiss.__version__}")
print("‚úÖ Libraries imported successfully!")

FAISS version: 1.13.1
‚úÖ Libraries imported successfully!


### Prepare Sample Data

In [2]:
# Sample corpus: ten one-sentence documents on different technology topics.
# The list position of each sentence is what the FAISS cells below use to map
# a search hit back to its text (documents[idx]).
documents = [
    "Python is a versatile programming language used for web development and data science.",
    "Machine learning models require large amounts of training data to perform well.",
    "Neural networks are inspired by the structure of the human brain.",
    "Natural language processing enables computers to understand human language.",
    "Deep learning is a subset of machine learning using multi-layered neural networks.",
    "Data visualization helps communicate insights from complex datasets.",
    "Cloud computing provides on-demand access to computing resources.",
    "Cybersecurity protects systems and networks from digital attacks.",
    "Blockchain technology enables secure, decentralized transactions.",
    "Quantum computing uses quantum mechanics to solve complex problems."
]

print(f"Total documents: {len(documents)}")

Total documents: 10


### Generate Embeddings

In [5]:
# Sentence-transformers model used for every embedding in the FAISS section.
model = SentenceTransformer('all-MiniLM-L6-v2')

# One vector per document; the result is indexed as a 2-D array below
# (rows = documents, columns = embedding dimensions).
embeddings = model.encode(documents)

# Summarise what was produced.
for summary_line in (
    f"Generated {len(embeddings)} embeddings",
    f"Each embedding has {embeddings.shape[1]} dimensions",
    f"Embeddings shape: {embeddings.shape}",
):
    print(summary_line)

Generated 10 embeddings
Each embedding has 384 dimensions
Embeddings shape: (10, 384)


### Create FAISS Index

In [7]:
# The index must be built for the width of the vectors it will hold, which is
# the last axis of the (n_documents, n_dimensions) embedding matrix.
dimension = embeddings.shape[-1]

# IndexFlatL2 does exact (brute-force) search using L2 distance.
index = faiss.IndexFlatL2(dimension)
index.add(embeddings)

for status_line in (
    f"‚úÖ FAISS index created!",
    f"Total vectors in index: {index.ntotal}",
):
    print(status_line)

‚úÖ FAISS index created!
Total vectors in index: 10


### Search with FAISS

In [8]:
# Free-text question and the number of neighbours to retrieve for it.
query = "WHat is artificial intelligence and machine learning"
k = 3

# The query is encoded as a one-element batch, so the search returns one row
# of distances and one row of document positions.
query_embedding = model.encode([query])
distances, indices = index.search(query_embedding, k)

print(f"Query: {query}\n")
print(f"Top {k} results:\n")

# Row 0 belongs to our single query; pair each hit position with its distance.
ranked_hits = zip(indices[0], distances[0])
for i, (idx, distance) in enumerate(ranked_hits, start=1):
    print(f"{i}. (distance: {distance:.4f})")
    print(f"  {documents[idx]}")
    print()


Query: WHat is artificial intelligence and machine learning

Top 3 results:

1. (distance: 0.9125)
  Deep learning is a subset of machine learning using multi-layered neural networks.

2. (distance: 1.1951)
  Machine learning models require large amounts of training data to perform well.

3. (distance: 1.2401)
  Natural language processing enables computers to understand human language.



### Using Cosine Similarity with FAISS

In [9]:
# Cosine similarity equals the inner product of unit-length vectors, so scale
# every document vector to length 1 before indexing. keepdims=True keeps the
# norms as a column so the division broadcasts row by row.
embeddings_normalized = embeddings / np.linalg.norm(embeddings, axis=1, keepdims=True)

# Inner-product index: on normalized vectors its scores are cosine similarities.
index_cosine = faiss.IndexFlatIP(dimension)
index_cosine.add(embeddings_normalized)

# Normalize each query row on its own (axis=1). A norm over the whole array is
# only correct when there is exactly one query; this form also works for a
# batch of queries.
query_embedding_normalized = query_embedding / np.linalg.norm(
    query_embedding, axis=1, keepdims=True
)

# Use the same k that the heading below reports instead of a second literal 3.
scores, indices = index_cosine.search(query_embedding_normalized, k)

print(f"Query: {query}\n")
print(f"Top {k} results with cosine similarity:\n")

# Row 0 holds the hits for the single query; higher score = more similar.
for i, (idx, score) in enumerate(zip(indices[0], scores[0]), 1):
    print(f"{i}. (Similarity: {score:.4f})")
    print(f"   {documents[idx]}")
    print()

Query: WHat is artificial intelligence and machine learning

Top 3 results with cosine similarity:

1. (Similarity: 0.5437)
   Deep learning is a subset of machine learning using multi-layered neural networks.

2. (Similarity: 0.4025)
   Machine learning models require large amounts of training data to perform well.

3. (Similarity: 0.3799)
   Natural language processing enables computers to understand human language.



### Saving and Loading FAISS Index

In [10]:
# Write the cosine index to a binary file next to the notebook.
faiss.write_index(index_cosine, "my_faiss_index.bin")
print("‚úÖ Index saved to disk")

# The index file holds vectors only, so the matching texts are stored
# separately; list order must stay aligned with the vector order.
import pickle
with open("documents.pkl", mode="wb") as documents_file:
    pickle.dump(documents, documents_file)
print("Documents saved")

‚úÖ Index saved to disk
Documents saved


In [11]:
# Read the index back from the file written by the save cell.
loaded_index = faiss.read_index("my_faiss_index.bin")
print(f"Index loaded: {loaded_index.ntotal} vectors")

# Restore the texts that belong to the vectors.
# NOTE(review): pickle.load can execute arbitrary code from the file; only
# load pickles this notebook wrote itself, never ones from untrusted sources.
with open("documents.pkl", mode="rb") as documents_file:
    loaded_documents = pickle.load(documents_file)
print(f"Documents loaded: {len(loaded_documents)} documents")

Index loaded: 10 vectors
Documents loaded: 10 documents


### Chroma Vector Database

In [12]:
import chromadb

# Print the ChromaDB version exposed by the imported module, then a
# confirmation line, so the recorded output documents the client version used.
print(f"ChromaDB version: {chromadb.__version__}")
print("‚úÖ ChromaDB imported successfully!")

ChromaDB version: 1.3.6
‚úÖ ChromaDB imported successfully!


In [15]:
# Persistent client: collection data is stored under ./chroma_db.
# (ChromaDB 0.4.0+ uses PersistentClient instead of Client(Settings(...)).)
client = chromadb.PersistentClient(path="./chroma_db")

# Fetch the collection if it already exists, otherwise create it.
collection = client.get_or_create_collection(
    metadata={"description": "Sample document collection"},
    name="my_documents",
)

for status_line in (
    f"‚úÖ Collection created: {collection.name}",
    f"Current count: {collection.count()} documents",
    f"üìÅ Data persisted to: ./chroma_db/",
):
    print(status_line)

‚úÖ Collection created: my_documents
Current count: 0 documents
üìÅ Data persisted to: ./chroma_db/


In [None]:
# Sample documents for the Chroma section.
# NOTE: this rebinds `documents`, replacing the ten-sentence list used by the
# FAISS cells; the Chroma cells below use this five-sentence list.
documents = [
    "Python is a versatile programming language used for web development and data science.",
    "Machine learning models require large amounts of training data to perform well.",
    "Neural networks are inspired by the structure of the human brain.",
    "Natural language processing enables computers to understand human language.",
    "Deep learning is a subset of machine learning using multi-layered neural networks."
]

# One metadata dict per document, in the same order as `documents`.
metadatas = [
    {"category": "programming", "topic": "python"},
    {"category": "AI", "topic": "machine learning"},
    {"category": "AI", "topic": "neural networks"},
    {"category": "AI", "topic": "NLP"},
    {"category": "AI", "topic": "deep learning"},
]

# Stable, position-based IDs: doc_0 ... doc_4.
ids = [f"doc_{i}" for i in range(len(documents))]

# The three lists are matched by position; catch a missing entry here rather
# than inside Chroma.
assert len(documents) == len(metadatas) == len(ids), "documents/metadatas/ids must align"

# Upsert instead of add: the collection lives on disk, so a re-run of the
# notebook meets IDs that already exist. Upsert inserts new IDs and overwrites
# existing ones, which makes this cell idempotent and restores doc_0 to its
# original text after the update cell further down has modified it.
# No embeddings are passed, so the collection's embedding function computes them.
collection.upsert(
    documents=documents,
    metadatas=metadatas,
    ids=ids
)

print(f"‚úÖ Added {len(documents)} documents to collection")
print(f"Total documents: {collection.count()}")


In [None]:
# Ask for the three stored documents closest to the question text; the query
# text is passed as a one-element batch.
results = collection.query(
    n_results=3,
    query_texts=["What is artificial intelligence"],
)

In [None]:
# Display the raw response object from the last query so its structure can be inspected.
results

In [None]:
# Run the question against the collection and keep the three closest matches.
results = collection.query(
    n_results=3,
    query_texts=["What is artificial intelligence?"],
)

print("Query: What is artificial intelligence?\n")
print("Top 3 results:\n")

# Each field is a list with one entry per query text; entry 0 belongs to the
# single question asked above.
top_documents = results['documents'][0]
top_metadatas = results['metadatas'][0]
top_distances = results['distances'][0]

for i, (doc, metadata, distance) in enumerate(
    zip(top_documents, top_metadatas, top_distances), start=1
):
    print(f"{i}. (Distance: {distance:.4f})")
    print(f"   Document: {doc}")
    print(f"   Metadata: {metadata}")
    print()

In [None]:
# Print the hits of the most recent query, one numbered entry per match.
# Index 0 selects the result lists for the first (only) query text.
top_documents = results['documents'][0]
top_metadatas = results['metadatas'][0]
top_distances = results['distances'][0]

for i, (doc, metadata, distance) in enumerate(
    zip(top_documents, top_metadatas, top_distances), start=1
):
    print(f"{i}. (Distance: {distance:.4f})")
    print(f"   Document: {doc}")
    print(f"   Metadata: {metadata}")
    print()

In [None]:
# Display only the matched document texts from the last query response.
results['documents']

### Filtering with Metadata

In [None]:
# Query with a metadata filter: only records whose "category" metadata equals
# "AI" are passed as candidates via `where`.
results = collection.query(
    where={"category":"AI"},
    n_results=3,
    query_texts=["Tell me about AI"],
)

print("Query: Tell me about AI (filtered by category='AI')\n")
print("Results:\n")

# Entry 0 of each field holds the hits for the single query text.
matched_documents = results['documents'][0]
matched_metadatas = results['metadatas'][0]

for i, (doc, metadata) in enumerate(zip(matched_documents, matched_metadatas), start=1):
    print(f"{i}. {doc}")
    print(f"    Category: {metadata['category']}, Topic: {metadata['topic']}")
    print()

### Using Custom Embedding Function

In [None]:
from chromadb.utils import embedding_functions

# Embedding function backed by the same sentence-transformers model that the
# FAISS section uses.
sentence_transformer_ef = embedding_functions.SentenceTransformerEmbeddingFunction(
    model_name="all-MiniLM-L6-v2"
)

# Separate collection whose embeddings come from the function above rather
# than from Chroma's default.
collection_custom = client.get_or_create_collection(
    name="custom_embeddings",
    embedding_function=sentence_transformer_ef
)

# Store the sample documents in the new collection.
# (Fixed: the original called `collection_custoom.add`, a misspelled name that
# raises NameError before anything is stored.)
collection_custom.add(
    documents=documents,
    metadatas=metadatas,
    ids=ids
)

print(f"‚úîÔ∏èCollection with custom embeddings created")
print(f"Documents: {collection_custom.count()}")

In [None]:
# Query the custom-embedding collection and ask for the stored embeddings in
# addition to the texts, metadata and distances.
results = collection_custom.query(
    include=["embeddings", "documents", "metadatas", "distances"],
    n_results=3,
    query_texts=["What is artificial intelligence?"],
)

In [None]:
# Display the raw response object from the last query so its fields can be inspected.
results

In [None]:
# Repeat the question against the custom-embedding collection, requesting
# embeddings alongside the usual fields.
results = collection_custom.query(
    include=["embeddings", "documents", "metadatas", "distances"],
    n_results=3,
    query_texts=["What is artificial intelligence?"],
)

print("Query: What is artificial intelligence?\n")
print("Top 3 results:\n")

# Entry 0 of each field corresponds to the single query text.
top_documents = results['documents'][0]
top_metadatas = results['metadatas'][0]
top_distances = results['distances'][0]

for i, (doc, metadata, distance) in enumerate(
    zip(top_documents, top_metadatas, top_distances), start=1
):
    print(f"{i}. (Distance: {distance:.4f})")
    print(f"   Document: {doc}")
    print(f"   Metadata: {metadata}")
    print()

### Update and Delete Documents

In [None]:
# Replace the text and metadata stored under the ID "doc_0".
collection.update(
    metadatas=[{"category": "programming", "topic": "python", "updated": True}],
    documents=["Python is an amazing programming language for AI and data science!"],
    ids=["doc_0"],
)
print("‚úÖ Document updated")

# Deletion is left disabled in this walkthrough; to remove a record instead:
# collection.delete(ids=["doc_4"])
# print("‚úÖ Document deleted")

# An update changes a record in place, so the count should be unchanged.
print(f"\nTotal documents after update: {collection.count()}")

### Building a complete RAG Retriever

### RAG Retriever with Chroma

In [None]:
import re

class RAGRetriever:
    """
    Minimal retrieval component for RAG built on a persistent Chroma collection.

    Documents are split into sentence-based chunks, stored with per-chunk
    metadata, and retrieved by similarity to a query string.

    Relies on `chromadb`, `embedding_functions` (from `chromadb.utils`) and
    `re` having been imported earlier in the notebook.
    """

    def __init__(self, collection_name="rag_collection", persist_dir="./rag_db"):
        """
        Initialize RAG retriever with Chroma.

        Args:
            collection_name: Name of the collection to open or create.
            persist_dir: Directory passed to the persistent Chroma client.
        """
        # Create Chroma client (using PersistentClient for ChromaDB 0.4.0+)
        self.client = chromadb.PersistentClient(path=persist_dir)

        # Embed with sentence-transformers
        embedding_fn = embedding_functions.SentenceTransformerEmbeddingFunction(
            model_name="all-MiniLM-L6-v2"
        )

        self.collection = self.client.get_or_create_collection(
            name=collection_name,
            embedding_function=embedding_fn
        )

        print(f"‚úÖ RAG Retriever initialized")
        print(f"Collection: {collection_name}")
        print(f"Current documents: {self.collection.count()}")
        print(f"üìÅ Data persisted to: {persist_dir}/")

    def chunk_text(self, text, chunk_size=500):
        """
        Split text into chunks made of whole sentences.

        Sentences are packed into a chunk until adding the next one would push
        the chunk past `chunk_size` characters. A single sentence longer than
        `chunk_size` is kept whole as its own chunk.

        Args:
            text: The text to split.
            chunk_size: Target maximum chunk length in characters.

        Returns:
            List of non-empty, whitespace-stripped chunk strings. Empty or
            whitespace-only input yields an empty list.
        """
        # Split after sentence-ending punctuation followed by whitespace.
        sentences = re.split(r'(?<=[.!?])\s+', text)
        chunks = []
        current_chunk = ""

        for sentence in sentences:
            if len(current_chunk) + len(sentence) > chunk_size and current_chunk:
                chunks.append(current_chunk.strip())
                current_chunk = sentence
            else:
                current_chunk += " " + sentence if current_chunk else sentence

        if current_chunk:
            chunks.append(current_chunk.strip())

        # Whitespace-only input used to produce a single "" chunk; drop empties
        # so callers never try to index blank text.
        return [chunk for chunk in chunks if chunk]

    def add_document(self, text, metadata=None, source_name="unknown"):
        """
        Chunk a document and store the chunks in the collection.

        Chunk IDs are derived from `source_name` and the chunk position, so
        adding the same source again overwrites its chunks (upsert) instead of
        keeping the stale text. If the new version has fewer chunks than the
        old one, the surplus old chunks are not removed.

        Args:
            text: Full document text.
            metadata: Optional dict merged into every chunk's metadata; its
                keys take precedence over the generated ones.
            source_name: Label stored as "source" and used as the ID prefix.

        Returns:
            Number of chunks stored (0 when the text contains nothing to index).
        """
        # Chunk the document
        chunks = self.chunk_text(text)

        # Nothing to index: skip the store call instead of handing Chroma
        # empty lists.
        if not chunks:
            print(f"Skipped document '{source_name}': no text to index")
            return 0

        # Prepare data for Chroma
        ids = [f"{source_name}_chunk_{i}" for i in range(len(chunks))]
        metadatas = [
            {
                "source": source_name,
                "chunk_index": i,
                "total_chunks": len(chunks),
                **(metadata or {})
            }
            for i in range(len(chunks))
        ]

        # Upsert so re-ingesting a source refreshes existing chunk IDs.
        self.collection.upsert(
            documents=chunks,
            metadatas=metadatas,
            ids=ids
        )

        print(f"‚úÖ Added document '{source_name}': {len(chunks)} chunks")
        return len(chunks)

    def retrieve(self, query, top_k=3, filter_metadata=None):
        """
        Retrieve relevant chunks for a query.

        Args:
            query: Query text.
            top_k: Number of results requested from the collection.
            filter_metadata: Optional metadata filter passed as `where`.

        Returns:
            Dict with 'documents', 'metadatas' and 'distances', each holding
            the result list for this single query.
        """
        results = self.collection.query(
            query_texts=[query],
            n_results=top_k,
            where=filter_metadata
        )

        # One query text was sent, so take entry 0 of each field.
        return {
            'documents': results['documents'][0],
            'metadatas': results['metadatas'][0],
            'distances': results['distances'][0]
        }

    def format_context(self, retrieved_results):
        """
        Format retrieved chunks for LLM prompt.

        Args:
            retrieved_results: Dict in the shape returned by `retrieve`.

        Returns:
            A string with a header line followed by one numbered entry per
            chunk, showing its source, a relevance value and the chunk text.
        """
        context = "Context from retrieved documents:\n\n"

        for i, (doc, metadata, distance) in enumerate(zip(
            retrieved_results['documents'],
            retrieved_results['metadatas'],
            retrieved_results['distances']
        ), 1):
            source = metadata.get('source', 'unknown')
            # 1/(1+distance) maps distance 0 to 1.0 and larger distances toward 0.
            context += f"[{i}] From {source} (Relevance: {1/(1+distance):.3f}):\n"
            context += f"{doc}\n\n"

        return context

print("‚úÖ RAGRetriever class defined!")

### Test the RAG Retriever

In [None]:
# Retriever backed by its own collection, using the default persist directory.
retriever = RAGRetriever(collection_name="test_rag")

# Three short sample texts: machine learning, Python, and vector databases.
doc1 = """
Machine learning is a branch of artificial intelligence that focuses on building systems 
that can learn from data. These systems improve their performance over time without being 
explicitly programmed. Common applications include image recognition, natural language 
processing, and recommendation systems.
"""

doc2 = """
Python is a high-level programming language known for its simplicity and readability. 
It's widely used in web development, data science, automation, and artificial intelligence. 
Python's extensive library ecosystem makes it ideal for rapid development.
"""

doc3 = """
Vector databases are specialized databases designed to store and query high-dimensional 
vectors efficiently. They're essential for modern AI applications like semantic search, 
recommendation systems, and retrieval-augmented generation (RAG). Popular examples include 
FAISS, Pinecone, and Chroma.
"""

# Ingest each text under its own source name and category. The final call is
# the cell's last expression, so its return value (the chunk count) is displayed.
retriever.add_document(doc1, source_name="ml_intro.txt", metadata={"category": "AI"})
retriever.add_document(doc2, source_name="python_guide.txt", metadata={"category": "programming"})
retriever.add_document(doc3, source_name="vector_db_overview.txt", metadata={"category": "databases"})

In [None]:
# End-to-end check: retrieve the three closest chunks for a question and
# render them as prompt context.
query = "What are vector databases used for?"
results = retriever.retrieve(query, top_k=3)
formatted_context = retriever.format_context(results)

print(f"Query: {query}\n")
print("="*80)
print(formatted_context)