In [3]:
from dotenv import load_dotenv
import os
from typing import List, Tuple, Dict
import numpy as np

# load dotenv
load_dotenv()
print("Environment Variable loaded successfully.")

Environment Variable loaded successfully.


In [4]:
#sample knowledge base
knowledge_base = [ "Machine learning is a subset of artificial intelligence that enables computers to learn from data without explicit programming. It uses algorithms to identify patterns and make predictions.",
    
    "Deep learning is a type of machine learning that uses neural networks with multiple layers. It's particularly effective for image recognition, natural language processing, and complex pattern recognition tasks.",
    
    "Natural Language Processing (NLP) is a field of AI that focuses on the interaction between computers and human language. It enables machines to understand, interpret, and generate human language.",
    
    "Embeddings are numerical representations of text that capture semantic meaning. Similar texts have similar embedding vectors, which enables semantic search and similarity comparison.",
    
    "RAG (Retrieval Augmented Generation) combines information retrieval with text generation. It retrieves relevant context from a knowledge base and uses it to generate more accurate and informed responses.",
    
    "OpenAI's GPT models are large language models trained on diverse internet text. They can perform various tasks like text generation, summarization, translation, and question answering.",
    
    "Vector databases store embeddings and enable fast similarity search. Popular options include Chroma, Pinecone, Weaviate, and FAISS. They're essential for production RAG systems.",
    
    "Fine-tuning is the process of adapting a pre-trained model to a specific task by training it on domain-specific data. It's useful when you need specialized behavior beyond what prompting can achieve."
]

print(f"Knowledge base has {len(knowledge_base)} documents.")
print(f"\nSample Example Document:\n{knowledge_base[0]}")

Knowledge base has 8 documents.

Sample Example Document:
Machine learning is a subset of artificial intelligence that enables computers to learn from data without explicit programming. It uses algorithms to identify patterns and make predictions.


In [5]:
from sentence_transformers import SentenceTransformer
def create_embeddings(texts:List[str], model: str="all-MiniLM-L6-v2") -> np.ndarray:
    """args:
    texts: List of text data to be embedded
    model: Model name from sentence transformers
    
    Returns:
    NumPy array of embeddings (shape:[num_texts, embedding_dimension])
    """
    model = SentenceTransformer(model)
    embeddings = model.encode(texts)
    return embeddings

# Call function to create embeddings
kb_embeddings = create_embeddings(knowledge_base)

print(f"Created embeddings with shape: {kb_embeddings.shape}")
print(f"Each document is respresented as a {kb_embeddings.shape[1]}-dimensional vector.")


  from .autonotebook import tqdm as notebook_tqdm


Created embeddings with shape: (8, 384)
Each document is respresented as a 384-dimensional vector.


In [6]:
from sklearn.metrics.pairwise import cosine_similarity
def retrieve_relevant_docs(
    query: str,
    knowledge_base: List[str],
    kb_embeddings: np.ndarray,
    top_k: int=2) -> List[Tuple[str, float]]:

    """Args:
    query: User's Question
    knowledge_base: List of documet texts
    kb_embeddings: NumPy array of knowledge base embeddings
    top_k: Number of top relevant documents to retrieve
    """

    # Create embeddings for query
    query_embedding = create_embeddings([query])

    # create similarity score between query and knowledge base
    similarities = cosine_similarity(query_embedding, kb_embeddings)[0] 

    # get top_k indices 
    top_k_indices = np.argsort(similarities)[::-1][:top_k]

    # return document with similarity scores
    results = [(knowledge_base[i], similarities[i]) for i in top_k_indices]
    return results

# call function to retrieve relevant documents
query = "What is the meaning of deep learning?"
relevant_docs = retrieve_relevant_docs(query, knowledge_base, kb_embeddings, top_k=2)

print(f"Query: {query}\n")
for i, (doc, score) in enumerate(relevant_docs, 1):
    print(f"Result {i} (similarity: {score:.4f}):")
    print(f"{doc}\n")

Query: What is the meaning of deep learning?

Result 1 (similarity: 0.7687):
Deep learning is a type of machine learning that uses neural networks with multiple layers. It's particularly effective for image recognition, natural language processing, and complex pattern recognition tasks.

Result 2 (similarity: 0.5162):
Machine learning is a subset of artificial intelligence that enables computers to learn from data without explicit programming. It uses algorithms to identify patterns and make predictions.



In [11]:
from openai import OpenAI

def generate_answer(
    query: str,
    context_docs: List[Tuple[str, float]],
    model: str = "gpt-4o-mini") -> Dict[str, any]:

    """Args:
    query: User's question
    context_docs: Retrieved documents with similarity scores
    model : OpenAI model name
    
    Returns: 
    Dictionary with answers and metadata
    """

    # instantiate OpenAI client
    client = OpenAI(api_key=os.environ["OPENAI_API_KEY"])

    # Prepare context by concatenating retrieved documents
    context = "\n\n".join([doc for doc, _ in context_docs])

    # Create prompt with context and query
    system_prompt = """You are a helpful AI assistant. Answer the user's question based on the provided context. 
If the context doesn't contain relevant information, say so rather than making up an answer."""

    user_prompt = f"""Context: {context}

    Question: {query}

    Answer based on the above context: """

    # call openai chat completion
    response = client.chat.completions.create(
        model = model,
        messages = [
            {"role": "system" , "content": system_prompt},
            {"role": "user" , "content": user_prompt}
        ],

        temperature = 0.6,
        max_tokens = 300
    )

    return{
        "answer": response.choices[0].message.content,
        "tokens_used": response.usage.total_tokens,
        "sources": [doc for doc, _ in context_docs],
        "similarity_scores":[score for _, score in context_docs]
    }

# call function to generate answer
query = "What is deep learning and what is it used for?"
relevant_docs = retrieve_relevant_docs(query, knowledge_base, kb_embeddings, top_k=3)
result = generate_answer(query, relevant_docs)

print(f"Query: {query}\n")
print(f"Answer:\n{result['answer']}\n")
print(f"Tokens used: {result['tokens_used']}")
print(f"\nSources used (similarity scores):")
for i, (source, score) in enumerate(zip(result['sources'], result['similarity_scores']), 1):
    print(f"{i}. [{score:.3f}] {source[:80]}...")



Query: What is deep learning and what is it used for?

Answer:
Deep learning is a type of machine learning that utilizes neural networks with multiple layers. It is particularly effective for tasks such as image recognition, natural language processing, and complex pattern recognition.

Tokens used: 205

Sources used (similarity scores):
1. [0.825] Deep learning is a type of machine learning that uses neural networks with multi...
2. [0.507] Machine learning is a subset of artificial intelligence that enables computers t...
3. [0.379] Natural Language Processing (NLP) is a field of AI that focuses on the interacti...


# RAG with class instead of Function

In [14]:
class BasicRAG:

    def __init__(
            self,
            embedding_model: str = "all-MiniLM-L6-v2",
            llm_model: str = "gpt-4o-mini",
            top_k: int=3
    ):
        
        self.embedding_model = SentenceTransformer(embedding_model)
        self.llm_model = llm_model
        self.top_k = top_k
        self.openai_client = OpenAI(api_key=os.environ["OPENAI_API_KEY"])

        # Store knowledge base and embeddings
        self.documents = []
        self.embeddings = np.ndarray = None

    def add_documents(self, documents:List[str]):
        """Args:
        texts: List of document texts to add to knowledge base
        """
        self.documents = documents
        print(f"Embedding {len(documents)} documents!")
        self.embeddings = self.embedding_model.encode(documents)
        print(f"{len(documents)} documents embedded successfully.")

    def retrieve(self, query: str) -> List[Tuple[str, float]]:
        """
        Retrieve most relevant documents for a query.
        
        Args:
            query: User's question
            
        Returns:
            List of (document, similarity_score) tuples
        """
        if self.embedding is None:
            raise ValueError("Knowledge base is empty. Add documents first., Call add_documents().")
        
        # embed query
        query_embedding = self.embedding_model.encode([query])

        # compute similarities
        similarities = cosine_similarity(query_embedding, self.embeddings)[0]

        # get top_k indices
        top_indices = np.argsort(similarities)[::-1][:self.top_k]

        return [(self.documents[i], similarities[i]) for i in top_indices]
    
    def generate(self, query: str, context_docs: List[Tuple[str, float]]) -> Dict:
        """Generate answer using LLM based on query and context documents.
        
        Args:
            query: User's question
            context_docs: Retrieved documents with similarity scores """
        
        # prepare and join context
        context = "\n\n".join([doc for doc, _ in context_docs])

        system_prompt = """You are a helpful AI assistant. Answer questions based on the provided context.
If the context doesn't contain enough information, acknowledge this limitation."""

        user_prompt = f"""Context: {context}
Question: {query}

Answer based on the above context: """
        
        #call openai chat  to generate answer
        response = self.openai_client.chat.completions.create(
            model = self.llm_model,
            messages = [
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_prompt}
            ],
            temperature = 0.7,
            max_tokens = 300
        )

        return {
            "answer": response.choices[0].message.content,
            "tokens_used": response.usage.total_tokens,
            "sources": context_docs
        }
    
    def query(self, question: str, return_sources: bool=True) -> Dict:
        """End-to-end query processing: retrieve relevant documents and generate answer.
        
        Args:
            question: User's question
            return_sources: Whether to include source documents in the output """
        
        # retrieve relevant documents
        relevant_docs = self.retrieve(question)

        # generate answer
        result = self.generate(question, relevant_docs)

        #format output/response
        response = {
            "question": question,
            "answer": result["answer"],
            "tokens_used": result["tokens_used"]

        }

        if return_sources:
            response["sources"] = [
                {"text": doc, "similarity": float(score)}
                for doc, score in result["sources"]
            ]
        
        return response

print("✅ SimpleRAG class defined!")

✅ SimpleRAG class defined!


In [15]:
class SimpleRAG:
    """
    A production-quality RAG system.
    
    This class encapsulates the entire RAG pipeline:
    - Document storage and embedding
    - Semantic search/retrieval
    - Answer generation
    
    Why use a class?
    - Maintains state (knowledge base, embeddings)
    - Provides a clean API (add_documents, query)
    - Easy to configure and reuse
    """
    
    def __init__(
        self,
        embedding_model: str = "all-MiniLM-L6-v2",
        llm_model: str = "gpt-4o-mini",
        top_k: int = 3
    ):
        """
        Initialize the RAG system.
        
        Args:
            embedding_model: Sentence transformer model name
            llm_model: OpenAI model for generation
            top_k: Number of documents to retrieve
        """
        self.embedding_model = SentenceTransformer(embedding_model)
        self.llm_model = llm_model
        self.top_k = top_k
        self.openai_client = OpenAI(api_key=os.environ["OPENAI_API_KEY"])
        
        # State: stores documents and their embeddings
        self.documents: List[str] = []
        self.embeddings: np.ndarray = None
    
    def add_documents(self, documents: List[str]) -> None:
        """
        Add documents to the knowledge base.
        
        Args:
            documents: List of text documents to add
            
        In production:
            - This might load from database/files
            - Could handle incremental updates
            - Might include document metadata
        """
        self.documents = documents
        print(f"Embedding {len(documents)} documents...")
        self.embeddings = self.embedding_model.encode(documents)
        print(f"✅ {len(documents)} documents indexed")
    
    def retrieve(self, query: str) -> List[Tuple[str, float]]:
        """
        Retrieve most relevant documents for a query.
        
        Args:
            query: User's question
            
        Returns:
            List of (document, similarity_score) tuples
        """
        if self.embeddings is None:
            raise ValueError("No documents added. Call add_documents() first.")
        
        # Embed query
        query_embedding = self.embedding_model.encode([query])
        
        # Calculate similarities
        similarities = cosine_similarity(query_embedding, self.embeddings)[0]
        
        # Get top-k
        top_indices = np.argsort(similarities)[::-1][:self.top_k]
        
        return [(self.documents[i], similarities[i]) for i in top_indices]
    
    def generate(self, query: str, context_docs: List[Tuple[str, float]]) -> Dict:
        """
        Generate answer using retrieved context.
        
        Args:
            query: User's question
            context_docs: Retrieved documents with scores
            
        Returns:
            Dictionary with answer and metadata
        """
        # Build context
        context = "\n\n".join([doc for doc, _ in context_docs])
        
        # Create prompt
        system_prompt = """You are a helpful AI assistant. Answer questions based on the provided context.
If the context doesn't contain enough information, acknowledge this limitation."""
        
        user_prompt = f"""Context:
{context}

Question: {query}

Answer:"""
        
        # Generate
        response = self.openai_client.chat.completions.create(
            model=self.llm_model,
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_prompt}
            ],
            temperature=0.7,
            max_tokens=300
        )
        
        return {
            "answer": response.choices[0].message.content,
            "tokens": response.usage.total_tokens,
            "sources": context_docs
        }
    
    def query(self, question: str, return_sources: bool = True) -> Dict:
        """
        Main method: Query the RAG system.
        
        This is the public API - simple to use!
        
        Args:
            question: User's question
            return_sources: Whether to include source documents
            
        Returns:
            Dictionary with answer and optional metadata
        """
        # Step 1: Retrieve
        relevant_docs = self.retrieve(question)
        
        # Step 2: Generate
        result = self.generate(question, relevant_docs)
        
        # Format response
        response = {
            "question": question,
            "answer": result["answer"],
            "tokens_used": result["tokens"]
        }
        
        if return_sources:
            response["sources"] = [
                {"text": doc, "similarity": float(score)}
                for doc, score in result["sources"]
            ]
        
        return response

print("✅ SimpleRAG class defined!!")

✅ SimpleRAG class defined!!


## Using the defined Rag Class

In [16]:
# Initialize RAG system
rag = SimpleRAG(
    embedding_model = "all-MiniLM-L6-v2",
    llm_model = "gpt-4o-mini",
    top_k = 3
)

# add documents to RAG knowledge base
rag.add_documents(knowledge_base)

# query RAG system
result = rag.query("What is RAG and why is it useful?.")

print(f"Question: {result['question']}\n")
print(f"Answer:\n{result['answer']}\n")
print(f"Tokens used: {result['tokens_used']}\n")
print("Sources:")
for i, source in enumerate(result['sources'], 1):
    print(f"{i}. [Similarity: {source['similarity']:.3f}]")
    print(f"   {source['text'][:100]}...\n")

Embedding 8 documents...
✅ 8 documents indexed
Question: What is RAG and why is it useful?.

Answer:
RAG (Retrieval Augmented Generation) is a methodology that combines information retrieval with text generation. It works by retrieving relevant context from a knowledge base and using that information to generate more accurate and informed responses. This is useful because it allows the generation of responses that are not only coherent but also grounded in factual information, enhancing the reliability and relevance of the generated content. By leveraging external knowledge, RAG systems can provide more comprehensive and contextually appropriate answers, making them particularly valuable in applications like customer support, content creation, and question-answering systems.

Tokens used: 272

Sources:
1. [Similarity: 0.621]
   RAG (Retrieval Augmented Generation) combines information retrieval with text generation. It retriev...

2. [Similarity: 0.448]
   Vector databases store embedd

# RAG with different LLM providers

In [17]:
from google import genai
from google.genai import types
from anthropic import Anthropic
class MultiProviderRAG(SimpleRAG):

    def __init__(
        self,
        provider: str="openai",
        embedding_model: str = "all-MiniLM-L6-v2",
        llm_model: str = None,
        top_k: int = 3):

        # set default model per provider
        default_models = {
            "openai": "gpt-4o-mini",
            "google": "gemini-2.5-flash",
            "anthropic": "claude-3-5-haiku-20241022"
        }

        self.provider = provider
        llm_model = llm_model or default_models[provider]

        # Initialize the parent class with the specified parameters
        super().__init__(
            embedding_model,
            llm_model,
            top_k 
        )

        if provider == "gemini":
            self.gemini_client = genai.Client(api_key=os.environ["GOOGLE_GENAI_API_KEY"])
        if provider == "anthropic":
            self.anthropic_client = Anthropic(api_key=os.environ["ANTHROPIC_API_KEY"])

        def generate(self, query: str, context_docs: List[Tuple[str, float]]) -> Dict:
            """
        Generate answer using the configured provider.
        
        This overrides the parent method to support multiple providers.
        """
            # prepare the context
            context = "\n\n".join([doc for doc, _ in context_docs])

            if self.provider == "openai":
                return self.generate_openai(query, context)
            if self.provider == "gemini":
                return self.generate_gemini(query, context)
            if self.provider == "anthropic":
                return self.generate_claude(query, context)
            
        def generate_openai(self, query: str, context: str) -> Dict:
            response = self.openai_client.chat.completions.create(
                model = self.llm_model,
                messages = [
                    {"role": "system", "context": "Answer based on context."},
                    {"role": "user", "context": f"Context:\n{context}\n\nQuestion: {query}\n\nAnswer:"}
                ],
                temperature = 0.7,
                max_tokens = 300
            )
            return {
                "answer": response.choices[0].message.content,
                "tokens": response.usage.total_tokens,
                "sources": context
            }
            
        def generate_gemini(self, query: str, context: str) -> Dict:
            prompt = f"Context:\n{context}\n\nQuestion: {query}\n\nAnswer based on the context:"

            response = self.gemini_client.models.generate_content(
                model = self.llm_model,
                contents = prompt,
                config = types.GenerateContentConfig(
                    temperature = 0.7,
                    max_output_tokens = 300
                )
            )
            return {
                "answer": response.text,
                "tokens": response.usage.metadata.total_token_count,
                "sources": context
            }
        
        def _generate_claude(self, query: str, context: str) -> Dict:
            """Generate using Claude."""
            prompt = f"Context:\n{context}\n\nQuestion: {query}\n\nAnswer based on the context above:"
        
            response = self.claude_client.messages.create(
                model=self.llm_model,
                max_tokens=300,
                temperature=0.7,
                messages=[{"role": "user", "content": prompt}]
            )
            return {

                "answer": response.content[0].text,
                "tokens": response.usage.input_tokens + response.usage.output_tokens,
                "sources": context
            }
print("✅ MultiProviderRAG class defined")         

✅ MultiProviderRAG class defined


In [18]:
# compare and test different providers
test_questions = "What is deep learning and how is it used?"

providers = ["openai", "google", "anthropic"]

for provider in providers:
    print(f"\n{'='*80}")
    print(f"Testing with {provider.upper()}")
    print('='*80)

    try:
        # Create RAG instance for the provider
        rag = MultiProviderRAG(
            provider=provider,
            top_k=3
        )

        rag.add_documents(knowledge_base)
        result = rag.query(test_questions)

        print(f"\nAnswer:\n{result['answer']}")
        print(f"\nTokens used: {result['tokens_used']}")
        
    except Exception as e:
        print(f"Error with {provider}: {e}")
        print("Make sure you have the API key set in your .env file")


Testing with OPENAI
Embedding 8 documents...
✅ 8 documents indexed

Answer:
Deep learning is a type of machine learning that utilizes neural networks with multiple layers to process data. It is particularly effective in tasks such as image recognition, natural language processing, and complex pattern recognition. By using these multi-layered networks, deep learning can automatically learn and extract features from large amounts of data, leading to improved accuracy and performance in various applications.

Tokens used: 223

Testing with GOOGLE
Embedding 8 documents...
✅ 8 documents indexed
Error with google: Error code: 404 - {'error': {'message': 'The model `gemini-2.5-flash` does not exist or you do not have access to it.', 'type': 'invalid_request_error', 'param': None, 'code': 'model_not_found'}}
Make sure you have the API key set in your .env file

Testing with ANTHROPIC
Error with anthropic: 'ANTHROPIC_API_KEY'
Make sure you have the API key set in your .env file
