In [7]:
from dotenv import load_dotenv
import os
from typing import List, Tuple, Dict
import numpy as np

# load dotenv
load_dotenv()
print("Environment Variable loaded successfully.")

Environment Variable loaded successfully.


In [8]:
#sample knowledge base
knowledge_base = [ "Machine learning is a subset of artificial intelligence that enables computers to learn from data without explicit programming. It uses algorithms to identify patterns and make predictions.",
    
    "Deep learning is a type of machine learning that uses neural networks with multiple layers. It's particularly effective for image recognition, natural language processing, and complex pattern recognition tasks.",
    
    "Natural Language Processing (NLP) is a field of AI that focuses on the interaction between computers and human language. It enables machines to understand, interpret, and generate human language.",
    
    "Embeddings are numerical representations of text that capture semantic meaning. Similar texts have similar embedding vectors, which enables semantic search and similarity comparison.",
    
    "RAG (Retrieval Augmented Generation) combines information retrieval with text generation. It retrieves relevant context from a knowledge base and uses it to generate more accurate and informed responses.",
    
    "OpenAI's GPT models are large language models trained on diverse internet text. They can perform various tasks like text generation, summarization, translation, and question answering.",
    
    "Vector databases store embeddings and enable fast similarity search. Popular options include Chroma, Pinecone, Weaviate, and FAISS. They're essential for production RAG systems.",
    
    "Fine-tuning is the process of adapting a pre-trained model to a specific task by training it on domain-specific data. It's useful when you need specialized behavior beyond what prompting can achieve."
]

print(f"Knowledge base has {len(knowledge_base)} documents.")
print(f"\nSample Example Document:\n{knowledge_base[0]}")

Knowledge base has 8 documents.

Sample Example Document:
Machine learning is a subset of artificial intelligence that enables computers to learn from data without explicit programming. It uses algorithms to identify patterns and make predictions.


In [9]:
from sentence_transformers import SentenceTransformer
def create_embeddings(texts:List[str], model: str="all-MiniLM-L6-v2") -> np.ndarray:
    """args:
    texts: List of text data to be embedded
    model: Model name from sentence transformers
    
    Returns:
    NumPy array of embeddings (shape:[num_texts, embedding_dimension])
    """
    model = SentenceTransformer(model)
    embeddings = model.encode(texts)
    return embeddings

# Call function to create embeddings
kb_embeddings = create_embeddings(knowledge_base)

print(f"Created embeddings with shape: {kb_embeddings.shape}")
print(f"Each document is respresented as a {kb_embeddings.shape[1]}-dimensional vector.")


Created embeddings with shape: (8, 384)
Each document is respresented as a 384-dimensional vector.


In [11]:
from sklearn.metrics.pairwise import cosine_similarity
def retrieve_relevant_docs(
    query: str,
    knowledge_base: List[str],
    kb_embeddings: np.ndarray,
    top_k: int=3) -> List[Tuple[str, float]]:

    """Args:
    query: User's Question
    knowledge_base: List of documet texts
    kb_embeddings: NumPy array of knowledge base embeddings
    top_k: Number of top relevant documents to retrieve
    """

    # Create embeddings for query
    query_embedding = create_embeddings([query])

    # create similarity score between query and knowledge base
    similarities = cosine_similarity(query_embedding, kb_embeddings)[0] 

    # get top_k indices 
    top_k_indices = np.argsort(similarities)[::-1][:top_k]

    # return document with similarity scores
    results = [(knowledge_base[i], similarities[i]) for i in top_k_indices]
    return results

# call function to retrieve relevant documents
query = "What is the meaning of deep learning?"
relevant_docs = retrieve_relevant_docs(query, knowledge_base, kb_embeddings, top_k=3)

print(f"Query: {query}\n")
for i, (doc, score) in enumerate(relevant_docs, 1):
    print(f"Result {i} (similarity: {score:.4f}):")
    print(f"{doc}\n")

Query: What is the meaning of deep learning?

Result 1 (similarity: 0.7687):
Deep learning is a type of machine learning that uses neural networks with multiple layers. It's particularly effective for image recognition, natural language processing, and complex pattern recognition tasks.

Result 2 (similarity: 0.5162):
Machine learning is a subset of artificial intelligence that enables computers to learn from data without explicit programming. It uses algorithms to identify patterns and make predictions.

Result 3 (similarity: 0.3774):
Natural Language Processing (NLP) is a field of AI that focuses on the interaction between computers and human language. It enables machines to understand, interpret, and generate human language.



In [None]:
from openai import OpenAI

def generate_answer(
    query: str,
    context_docs: List[Tuple[str, float]],
    model: str = "gpt-4o-mini") -> Dict[str, any]:

    """Args:
    query: User's question
    context_docs: Retrieved documents with similarity scores
    model : OpenAI model name
    
    Returns: 
    Dictionary with answers and metadata
    """

    # instantiate OpenAI client
    client = OpenAI(api_key=os.environ["OPENAI_API_KEY"])

    # Prepare context by concatenating retrieved documents
    context = "\n\n".join([doc for doc, _ in context_docs])

    # Create prompt with context and query
    system_prompt = """You are a helpful AI assistant. Answer the user's question based on the provided context. 
If the context doesn't contain relevant information, say so rather than making up an answer."""

    user_prompt = f"""Context: {context}

    Question: {query}

    Answer based on the above context: """

    # call openai chat completion
    response = client.chat.completions.create(
        model = model,
        messages = [
            {"role": "system" , "content": system_prompt},
            {"role": "user" , "content": user_prompt}
        ],

        temperature = 0.6,
        max_tokens = 300
    )

    return{
        "answer": response.choices[0].message.content,
        "tokens_used": response.usage.total_tokens,
        "sources": [doc for doc, _ in context_docs],
        "similarity_scores":[score for _, score in context_docs]
    }

# call function to generate answer
query = "What is deep learning and what is it used for?"
relevant_docs = retrieve_relevant_docs(query, knowledge_base, kb_embeddings, top_k=3)
result = generate_answer(query, relevant_docs)

print(f"Query: {query}\n")
print(f"Answer:\n{result['answer']}\n")
print(f"Tokens used: {result['tokens_used']}")
print(f"\nSources used (similarity scores):")
for i, (source, score) in enumerate(zip(result['sources'], result['similarity_scores']), 1):
    print(f"{i}. [{score:.3f}] {source[:80]}...")



ValueError: could not convert string to float: "Deep learning is a type of machine learning that uses neural networks with multiple layers. It's particularly effective for image recognition, natural language processing, and complex pattern recognition tasks."