## Setup and Configuration

In [None]:
import ollama
import psycopg2
import json
import pandas as pd
import numpy as np
from statistics import mean, stdev
import time

In [None]:
# Connection settings for the PostgreSQL instance that stores the embeddings
POSTGRES_CONFIG = dict(
    host='localhost',
    port=5432,
    database='rag_db',
    user='postgres',
    password='postgres',
)

# Embedding model identifier passed to Ollama, plus the short alias used for naming
EMBEDDING_MODEL = 'hf.co/CompendiumLabs/bge-base-en-v1.5-gguf'
EMBEDDING_MODEL_ALIAS = 'bge_base_en_v1.5'

# Table name is derived from the alias; dots are replaced with underscores
TABLE_NAME = 'embeddings_' + EMBEDDING_MODEL_ALIAS.replace('.', '_')

# Echo the effective configuration in a single write
print('\n'.join([
    "Configuration:",
    f"  PostgreSQL: {POSTGRES_CONFIG['host']}:{POSTGRES_CONFIG['port']}/{POSTGRES_CONFIG['database']}",
    f"  Embedding model: {EMBEDDING_MODEL}",
    f"  Table: {TABLE_NAME}",
]))

## Helper Classes for Embedding Analysis

In [None]:
class PostgreSQLVectorDB:
    """Helper class to query an embeddings table in PostgreSQL with pgvector.

    One instance owns one psycopg2 connection, opened eagerly in ``__init__``.
    Every query runs inside its own transaction scope (``with self.conn:``),
    so a failing statement is rolled back instead of leaving the connection
    in an aborted transaction that would break every later call, and
    successful reads do not leave the session idle-in-transaction.

    The instance can also be used as a context manager; leaving the ``with``
    block closes the connection.

    NOTE: ``table_name`` is interpolated directly into the SQL text (it cannot
    be sent as a bound parameter), so it must come from trusted configuration
    and never from user input.
    """

    def __init__(self, config, table_name):
        """Store the settings and open the connection immediately.

        Args:
            config: Mapping with 'host', 'port', 'database', 'user' and
                'password' keys.
            table_name: Name of the table holding the embeddings.

        Raises:
            psycopg2.OperationalError: If the connection cannot be opened.
        """
        self.config = config
        self.table_name = table_name
        self.conn = None
        self.connect()

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc, tb):
        self.close()
        return False  # never swallow the caller's exception

    def connect(self):
        """Open the psycopg2 connection described by ``self.config``.

        Prints a status line either way and re-raises connection failures.
        """
        try:
            self.conn = psycopg2.connect(
                host=self.config['host'],
                port=self.config['port'],
                database=self.config['database'],
                user=self.config['user'],
                password=self.config['password']
            )
            print(f'✓ Connected to PostgreSQL at {self.config["host"]}:{self.config["port"]}')
        except psycopg2.OperationalError as e:
            print(f'✗ Failed to connect to PostgreSQL: {e}')
            raise

    def _fetchall(self, query, params=None):
        """Run one statement in its own transaction and return all rows.

        ``with self.conn:`` commits when the block succeeds and rolls back
        when it raises; it does not close the connection.
        """
        with self.conn:
            with self.conn.cursor() as cur:
                cur.execute(query, params)
                return cur.fetchall()

    def get_chunk_count(self):
        """Return the number of rows in the embeddings table."""
        rows = self._fetchall(f'SELECT COUNT(*) FROM {self.table_name}')
        return rows[0][0]

    def similarity_search(self, query_embedding, top_n=3):
        """Find most similar chunks using pgvector.

        Args:
            query_embedding: Query vector; it is cast to ``vector`` in SQL.
            top_n: Maximum number of rows to return.

        Returns:
            List of ``(chunk_text, similarity)`` tuples ordered from most to
            least similar, where similarity is ``1 - (embedding <=> query)``.
        """
        rows = self._fetchall(f'''
                SELECT chunk_text, 
                       1 - (embedding <=> %s::vector) as similarity
                FROM {self.table_name}
                ORDER BY embedding <=> %s::vector
                LIMIT %s
            ''', (query_embedding, query_embedding, top_n))
        return [(chunk, score) for chunk, score in rows]

    def get_all_similarity_scores(self, query_embedding):
        """Get similarity scores for all chunks (for statistical analysis).

        Returns:
            List with one similarity value per row of the table, sorted in
            descending order. This reads the whole table.
        """
        rows = self._fetchall(f'''
                SELECT 1 - (embedding <=> %s::vector) as similarity
                FROM {self.table_name}
                ORDER BY similarity DESC
            ''', (query_embedding,))
        return [row[0] for row in rows]

    def close(self):
        """Close the connection if one was opened."""
        if self.conn:
            self.conn.close()

# Connect to database
# `db` is the module-level handle that the analysis functions below read as a
# global, so this cell must run before any of them is called.
db = PostgreSQLVectorDB(POSTGRES_CONFIG, TABLE_NAME)
# Row count of TABLE_NAME; nothing is loaded into memory here, the message
# only reports how many embeddings the table holds.
count = db.get_chunk_count()
print(f'\n✓ Loaded {count} embeddings from database')

## 1. Analyze Retrieval Quality by Query Type

In [None]:
def analyze_retrieval_quality(queries_by_type, top_n=5):
    """Test different query types and summarise retrieval quality per type.

    Each query is embedded with ``EMBEDDING_MODEL`` and searched through the
    module-level ``db`` handle. Queries that retrieve nothing are left out of
    the averages; a type with no usable query reports 0 for every average.

    Args:
        queries_by_type: Dict mapping query type to list of queries
        top_n: Number of results to retrieve per query

    Returns:
        DataFrame with one row per query type and the columns
        'Query Type', 'Test Count', 'Avg Top-1 Similarity',
        'Avg Top-5 Similarity' and 'Avg All Similarities'.
        'Avg Top-5 Similarity' averages at most the first five retrieved
        scores, so it covers fewer than five when ``top_n`` is below 5.
    """
    results = []

    for query_type, queries in queries_by_type.items():
        print(f"\nAnalyzing {query_type} queries...")

        # Per-query metrics collected for this type
        top1_scores = []
        top5_means = []
        overall_means = []

        for query in queries:
            query_embedding = ollama.embed(model=EMBEDDING_MODEL, input=query)['embeddings'][0]

            # Get top N results
            top_results = db.similarity_search(query_embedding, top_n=top_n)
            if not top_results:
                # Nothing retrieved: skip the full-table scan below as well,
                # since this query contributes to none of the averages.
                continue

            top_scores = [score for _, score in top_results[:5]]
            top1_scores.append(top_scores[0])
            top5_means.append(mean(top_scores))

            # Get all similarity scores for statistical analysis
            all_scores = db.get_all_similarity_scores(query_embedding)
            if all_scores:  # mean() raises on an empty sequence
                overall_means.append(mean(all_scores))

        # Compute averages
        results.append({
            'Query Type': query_type,
            'Test Count': len(queries),
            'Avg Top-1 Similarity': mean(top1_scores) if top1_scores else 0,
            'Avg Top-5 Similarity': mean(top5_means) if top5_means else 0,
            'Avg All Similarities': mean(overall_means) if overall_means else 0,
        })

    return pd.DataFrame(results)

# Example: sample questions grouped by topic, one list per query type
test_queries = dict(
    Science=[
        'What is photosynthesis?',
        'How does DNA work?',
        'What is quantum physics?',
    ],
    Geography=[
        'What is the capital of France?',
        'Where is Mount Everest?',
        'What country is Tokyo in?',
    ],
    History=[
        'When did World War 2 end?',
        'Who was Napoleon?',
        'What was the Renaissance?',
    ],
)

# Run the analysis, then show the per-type summary table under its heading
quality_df = analyze_retrieval_quality(test_queries, top_n=5)
print(
    "\n=== Retrieval Quality by Query Type ===",
    quality_df.to_string(index=False),
    sep="\n",
)

## 2. Debug Poor Retrievals

In [None]:
def debug_retrieval(query, top_n=10):
    """Print a detailed view of what gets retrieved for a query.

    The query is embedded with ``EMBEDDING_MODEL`` and searched through the
    module-level ``db`` handle. Output goes to stdout; nothing is returned.

    Shows:
    - Top N chunks with similarity scores, the first line of each chunk
      (with any 'Article: ' text removed) and a 150-character preview
    - Max / min / mean of the retrieved similarity scores, plus the standard
      deviation when more than one chunk came back

    Args:
        query: Query text to embed and search for.
        top_n: Maximum number of chunks to retrieve and display.
    """
    query_embedding = ollama.embed(model=EMBEDDING_MODEL, input=query)['embeddings'][0]
    results = db.similarity_search(query_embedding, top_n=top_n)

    print(f"Query: '{query}'\n")
    print("Top retrievals:")

    if not results:
        # Empty table or top_n=0: max()/min()/mean() below would raise on an
        # empty sequence, so report the situation and stop.
        print("  (no chunks retrieved)")
        return

    for i, (chunk, score) in enumerate(results, 1):
        # Extract article title
        title = chunk.split('\n')[0].replace('Article: ', '')
        preview = chunk[:150].replace('\n', ' ') + '...'

        print(f"\n  [{i}] Similarity: {score:.4f}")
        print(f"      Article: {title}")
        print(f"      Preview: {preview}")

    # Statistics
    scores = [score for _, score in results]
    print(f"\n=== Similarity Statistics ===")
    print(f"Max: {max(scores):.4f}")
    print(f"Min: {min(scores):.4f}")
    print(f"Mean: {mean(scores):.4f}")
    if len(scores) > 1:  # stdev() needs at least two data points
        print(f"Stdev: {stdev(scores):.4f}")

# Example: Debug a specific query
# Prints the five closest chunks and their score statistics; needs the `db`
# connection opened earlier and the Ollama embedding model to be reachable.
debug_retrieval('What is machine learning?', top_n=5)

## 3. Compare Embedding Models (if multiple registered)

In [None]:
def compare_embedding_models(query, models_and_tables):
    """Compare the top retrieval result across several embedding tables.

    For each entry a fresh connection is opened on that table, the query is
    embedded, and the single best match is recorded. A failure for one model
    is printed and skipped so the remaining models are still compared.

    NOTE(review): the query is always embedded with ``EMBEDDING_MODEL``, not
    with the model named by the dict key, so scores against tables built by a
    different model are presumably not meaningful. Confirm before relying on
    the numbers.

    Args:
        query: Test query
        models_and_tables: Dict mapping model name to table name

    Returns:
        DataFrame with 'Model', 'Top Similarity' and 'Top Article' columns,
        one row per model that returned at least one result.
    """
    # Local import keeps this cell self-contained.
    from contextlib import closing

    results = []

    for model_name, table_name in models_and_tables.items():
        try:
            # closing() guarantees the per-model connection is released even
            # when the embed or search call raises.
            with closing(PostgreSQLVectorDB(POSTGRES_CONFIG, table_name)) as model_db:
                # For model comparison, use appropriate embedding endpoint
                # Note: This is simplified - in reality you'd need the model's embedding API
                query_embedding = ollama.embed(model=EMBEDDING_MODEL, input=query)['embeddings'][0]

                top_result = model_db.similarity_search(query_embedding, top_n=1)

            if top_result:
                chunk, score = top_result[0]
                title = chunk.split('\n')[0].replace('Article: ', '')

                results.append({
                    'Model': model_name,
                    'Top Similarity': score,
                    'Top Article': title,
                })
        except Exception as e:
            # Deliberate best-effort: report and continue with the next model
            print(f"Could not compare with {model_name}: {e}")

    return pd.DataFrame(results)

# Usage notes for the model comparison helper, emitted as one write
print("\n".join([
    "Model comparison is available if you have multiple embedding models registered.",
    "\nTo generate embeddings with different models:",
    "  1. Run foundation/02-rag-postgresql-persistent.ipynb with different EMBEDDING_MODEL",
    "  2. Each model creates its own embeddings_* table",
    "  3. Use this function to compare retrieval quality across models",
]))

## 4. Retrieval Performance Profiling

In [None]:
def profile_retrieval_speed(queries, num_runs=1):
    """Measure embedding and retrieval latency for a set of queries.

    For every query the embedding call and the top-5 similarity search are
    each repeated ``num_runs`` times and the average time per run is
    recorded. A summary is printed to stdout.

    Args:
        queries: Iterable of test queries (must contain at least one)
        num_runs: Number of times to run each query (for averaging); >= 1

    Returns:
        Dict with 'embedding_mean_ms', 'retrieval_mean_ms' and
        'total_mean_ms', each the mean across queries in milliseconds.

    Raises:
        ValueError: If ``queries`` is empty or ``num_runs`` is below 1.
    """
    queries = list(queries)
    if num_runs < 1:
        # Otherwise the per-run division below fails with ZeroDivisionError
        raise ValueError(f"num_runs must be >= 1, got {num_runs}")
    if not queries:
        # Otherwise mean() below fails with an opaque StatisticsError
        raise ValueError("queries must contain at least one query")

    embedding_times = []
    retrieval_times = []

    print(f"Profiling {len(queries)} queries (averaging {num_runs} runs)...\n")

    for query in queries:
        # Time embedding generation. perf_counter() is a monotonic,
        # high-resolution clock, unlike time.time() which can jump when the
        # system clock is adjusted.
        start = time.perf_counter()
        for _ in range(num_runs):
            query_embedding = ollama.embed(model=EMBEDDING_MODEL, input=query)['embeddings'][0]
        embedding_times.append((time.perf_counter() - start) / num_runs * 1000)  # ms

        # Time similarity search (result rows are discarded; only latency matters)
        start = time.perf_counter()
        for _ in range(num_runs):
            db.similarity_search(query_embedding, top_n=5)
        retrieval_times.append((time.perf_counter() - start) / num_runs * 1000)  # ms

    embedding_mean = mean(embedding_times)
    retrieval_mean = mean(retrieval_times)
    total_mean = embedding_mean + retrieval_mean

    print(f"=== Performance Profile ===")
    print(f"\nEmbedding Generation (BGE model):")
    print(f"  Mean: {embedding_mean:.2f} ms")
    print(f"  Min: {min(embedding_times):.2f} ms")
    print(f"  Max: {max(embedding_times):.2f} ms")

    print(f"\nRetrieval (PostgreSQL pgvector):")
    print(f"  Mean: {retrieval_mean:.2f} ms")
    print(f"  Min: {min(retrieval_times):.2f} ms")
    print(f"  Max: {max(retrieval_times):.2f} ms")

    print(f"\nTotal Per-Query Time: {total_mean:.2f} ms")

    return {
        'embedding_mean_ms': embedding_mean,
        'retrieval_mean_ms': retrieval_mean,
        'total_mean_ms': total_mean,
    }

# Sample questions used to time the embed-then-search path (one run each)
profile_queries = ['What is photosynthesis?', 'Who was Albert Einstein?', 'What is Python?']

profile_metrics = profile_retrieval_speed(queries=profile_queries, num_runs=1)

## Cleanup

In [None]:
# Close database connection when done
# After this cell the functions above that use `db` can no longer query;
# re-run the connection cell to use them again.
db.close()
print("✓ Database connection closed")