In [None]:
!curl -fsSL https://ollama.com/install.sh | sh
!nohup ollama serve > output.log 2>&1 &
!ollama pull phi3

In [None]:
!pip install ollama faiss-cpu sentence-transformers numpy

Unoptimized Agent

In [None]:
import ollama
import faiss
import numpy as np
from sentence_transformers import SentenceTransformer
import time
import logging
from typing import List, Tuple

# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

class SimpleAgent:
    def __init__(self):
        self.embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
        self.dimension = 384  # Dimension of the embedding model
        self.index = faiss.IndexFlatL2(self.dimension)
        self.documents = []
        self.token_usage = 0
        self.query_times = []

    def embed_text(self, text: str) -> np.ndarray:
        """Generate embedding for a given text."""
        return self.embedding_model.encode([text])[0]

    def add_document(self, document: str):
        """Add a document to the FAISS index."""
        start_time = time.time()
        embedding = self.embed_text(document)
        self.index.add(np.array([embedding], dtype=np.float32))
        self.documents.append(document)
        logger.info(f"Added document. Time: {time.time() - start_time:.4f}s")

    def query(self, question: str, k: int = 1) -> Tuple[str, int]:
        """Query the agent with a question."""
        start_time = time.time()

        # Generate embedding for the question
        question_embedding = self.embed_text(question)

        # Search FAISS index
        distances, indices = self.index.search(np.array([question_embedding], dtype=np.float32), k)

        # Retrieve relevant documents
        context = [self.documents[idx] for idx in indices[0] if idx < len(self.documents)]
        context_text = "\n".join(context)

        # Prepare prompt (unoptimized: verbose and redundant)
        prompt = f"""
        You are a helpful assistant. Given the following context and question, provide a detailed answer.

        Context:
        {context_text}

        Question:
        {question}

        Please provide a comprehensive response with all relevant details.
        """

        # Call Ollama Phi-4
        response = ollama.chat(
            model='phi3',
            messages=[{'role': 'user', 'content': prompt}]
        )

        # Track token usage (approximate)
        prompt_tokens = len(prompt.split())
        response_tokens = len(response['message']['content'].split())
        total_tokens = prompt_tokens + response_tokens
        self.token_usage += total_tokens

        # Track query time
        query_time = time.time() - start_time
        self.query_times.append(query_time)

        logger.info(f"Query processed. Time: {query_time:.4f}s, Tokens: {total_tokens}")

        return response['message']['content'], total_tokens

    def get_performance_metrics(self) -> dict:
        """Return performance metrics."""
        return {
            'total_token_usage': self.token_usage,
            'average_query_time': np.mean(self.query_times) if self.query_times else 0,
            'number_of_queries': len(self.query_times)
        }

# Example usage
if __name__ == "__main__":
    agent = SimpleAgent()

    # Add sample documents
    documents = [
        "The capital of France is Paris.",
        "Python is a popular programming language.",
        "The sun is a star."
    ]

    for doc in documents:
        agent.add_document(doc)

    # Test queries
    queries = [
        "What is the capital of France?",
        "What is Python?",
        "Is the sun a star?"
    ]

    for query in queries:
        answer, tokens = agent.query(query)
        print(f"Query: {query}")
        print(f"Answer: {answer}")
        print(f"Tokens used: {tokens}\n")

    # Print performance metrics
    metrics = agent.get_performance_metrics()
    print("Performance Metrics:")
    print(f"Total Token Usage: {metrics['total_token_usage']}")
    print(f"Average Query Time: {metrics['average_query_time']:.4f}s")
    print(f"Number of Queries: {metrics['number_of_queries']}")

Query: What is the capital of France?
Answer: The capital city of France is indeed Paris. Located in the north-central part of the country, along the Seine River within the Île de la Cité and spread across multiple districts such as Latin Quarter, Montmartre, Saint-Germain, Pigalle, Gare du Nord, Canal Monet (Banksy), Marais, Palais Royal, Place Vendôme, Luxembourg Gardens.

Paris is not only the political center but also a global hub for art, fashion, gastronomy, and culture since the 17th century and has been included in one or several UNESCO World Heritage Sites (centred around its historic cityscape). The population of Paris as per Census Reports from France'semn Bureau National de la Statistique is estimated to be about 2.2 million people, with a metropolitan region containing over 12 million residents and constituting one of the world’s most visited cities for tourists annually.

The city itself was founded in the 3rd century BC by the Parisii tribe under Roman rule as an establi

You Answer: Optimized Agent with performance metrics

Optimization Techniques Implemented:
(List down the techniques you've used)

- Embedding Cache: Used @lru_cache to cache query embeddings, reducing repeated computation.

- Batch Document Embedding: Used batch encoding for faster and more efficient document indexing.

- ANN Search: Switched from IndexFlatL2 to IndexIVFFlat for faster query time.

- Prompt Minimization: Reduced prompt size to cut down token usage and speed up response.

- Explicit Device Control: Forced model to use CPU to avoid latency due to implicit hardware switching.


In [None]:
from functools import lru_cache

class OptimizedAgent:
    def __init__(self):
        self.embedding_model = SentenceTransformer('all-MiniLM-L6-v2', device='cpu')
        self.dimension = 384
        quantizer = faiss.IndexFlatL2(self.dimension)
        self.index = faiss.IndexIVFFlat(quantizer, self.dimension, 5)
        self.index.nprobe = 10
        self.documents = []
        self.token_usage = 0
        self.query_times = []
        self.cache_hits = 0
        self.built = False

    def embed_texts(self, texts: List[str]) -> np.ndarray:
        return self.embedding_model.encode(texts, batch_size=16, show_progress_bar=False)

    @lru_cache(maxsize=512)
    def embed_text(self, text: str) -> Tuple[bool, np.ndarray]:
        self.cache_hits += 1
        return True, self.embedding_model.encode([text])[0]

    def add_documents(self, documents: List[str]):
        start_time = time.time()
        embeddings = self.embed_texts(documents)
        self.index.train(np.array(embeddings, dtype=np.float32))
        self.index.add(np.array(embeddings, dtype=np.float32))
        self.documents.extend(documents)
        self.built = True
        logger.info(f"Batch added {len(documents)} documents. Time: {time.time() - start_time:.4f}s")

    def query(self, question: str, k: int = 1) -> Tuple[str, int]:
        start_time = time.time()
        cached, question_embedding = self.embed_text(question)
        distances, indices = self.index.search(np.array([question_embedding], dtype=np.float32), k)
        context = [self.documents[idx] for idx in indices[0] if idx != -1 and idx < len(self.documents)]

        context_text = "\n".join(context)
        prompt = f"Context:\n{context_text}\n\nQuestion: {question}\nPlease answer with a clear and complete sentence."
        response = ollama.chat(
            model='phi3',
            messages=[{'role': 'user', 'content': prompt}]
        )
        prompt_tokens = int(len(prompt.split()) * 1.3)
        response_tokens = int(len(response['message']['content'].split()) * 1.3)
        total_tokens = prompt_tokens + response_tokens
        self.token_usage += total_tokens
        query_time = time.time() - start_time
        self.query_times.append(query_time)
        logger.info(f"[Optimized] Query processed. Time: {query_time:.4f}s, Tokens: {total_tokens}, Cache Hit: {cached}")
        return response['message']['content'], total_tokens

    def get_performance_metrics(self) -> dict:
        return {
            'total_token_usage': self.token_usage,
            'average_query_time': np.mean(self.query_times) if self.query_times else 0,
            'number_of_queries': len(self.query_times),
            'cache_hits': self.cache_hits
        }

# Comparison Example
if __name__ == "__main__":
    simple_agent = SimpleAgent()
    optimized_agent = OptimizedAgent()

    sample_docs = [
        "The capital of France is Paris.",
        "Python is a popular programming language.",
        "The sun is a star.",
        "Bananas are rich in potassium.",
        "Mount Everest is the tallest mountain in the world."
    ]

    for doc in sample_docs:
        simple_agent.add_document(doc)

    optimized_agent.add_documents(sample_docs)

    queries = [
        "What is the capital of France?",
        "Tell me about Python.",
        "Is the sun a planet or star?",
        "What is Mount Everest?",
        "What mineral is in bananas?"
    ]

    print("\n--- Comparison Results ---\n")
    for q in queries:
        print(f"Query: {q}")
        simple_resp, simple_tokens = simple_agent.query(q)
        optimized_resp, optimized_tokens = optimized_agent.query(q)
        print(f"SimpleAgent Answer: {simple_resp}\nTokens: {simple_tokens}")
        print(f"OptimizedAgent Answer: {optimized_resp}\nTokens: {optimized_tokens}\n")

    print("--- Performance Metrics ---")
    s_metrics = simple_agent.get_performance_metrics()
    print("\nPerformance Metrics for Simple Agent:")
    print(f"Total Token Usage: {s_metrics['total_token_usage']}")
    print(f"Average Query Time: {s_metrics['average_query_time']:.4f}s")
    print(f"Number of Queries: {s_metrics['number_of_queries']}")
    o_metrics = optimized_agent.get_performance_metrics()
    print("\nPerformance Metrics for Optimized Agent:")
    print(f"Total Token Usage: {o_metrics['total_token_usage']}")
    print(f"Average Query Time: {o_metrics['average_query_time']:.4f}s")
    print(f"Number of Queries: {o_metrics['number_of_queries']}")




--- Comparison Results ---

Query: What is the capital of France?
SimpleAgent Answer: The capital of France is Paris. Situated in the north-central part of the country, along the Seine River within an area known as Île-dedependent for its geographical and cultural significance to the city's identity. Often referred to as "The City of Light" (La Ville Lumière), this title honors both Paris’ historical role during the Age of Enlightenment in fostering educational excellence, revolutionary ideas that influenced Europe considerably since the 18th century and its early adoption of street lighting.

Paris is not only France's capital city but also a global center for art, fashion, gastronomy, and culture within the French region of Île-de-France where it serves as an economic engine contributing significantly to both national GDP and international tourism revenue. The heartbeat of Paris can be felt at its world-famous landmarks like the Eiffel Tower (constructed in 1889 for the Exposition U

Performance Improvement: (Comparision)
- Simple Agent:
  - Total Token Usage: 2400
  - Average Query Time: 9.5963s
  - Number of Queries: 5

- Optimized Agent:
  - Total Token Usage: 314
  - Average Query Time: 0.6503s
  - Number of Queries: 5
