## Prerequisites

1. PostgreSQL is running (via Docker or locally)
2. The `psycopg2-binary` package is installed
3. The `pgvector` extension has already been created (from the previous RAG setup)

In [None]:
import psycopg2
import json
from datetime import datetime

## Configuration

In [None]:
# PostgreSQL connection settings.
# Each value can be overridden with the standard libpq environment variable
# (PGHOST, PGPORT, PGDATABASE, PGUSER, PGPASSWORD) so credentials do not have
# to be edited into the notebook. The fallbacks are the original hard-coded
# values, so behavior is unchanged when none of the variables are set.
import os

POSTGRES_CONFIG = {
    'host': os.environ.get('PGHOST', 'localhost'),
    # `or` also covers an empty PGPORT, which int() would reject.
    'port': int(os.environ.get('PGPORT') or 5432),
    'database': os.environ.get('PGDATABASE', 'rag_db'),
    'user': os.environ.get('PGUSER', 'postgres'),
    'password': os.environ.get('PGPASSWORD', 'postgres'),
}

## Connect to Database

In [None]:
# Open the connection that the remaining cells share as `conn`.
# On failure, print a hint for starting a local pgvector container and
# re-raise so the notebook stops here instead of failing later.
try:
    conn = psycopg2.connect(
        **{
            key: POSTGRES_CONFIG[key]
            for key in ('host', 'port', 'database', 'user', 'password')
        }
    )
except psycopg2.OperationalError as err:
    print(f"✗ Failed to connect to PostgreSQL: {err}")
    startup_hint = (
        "Make sure PostgreSQL is running. Start with:",
        "docker run -d --name pgvector-rag \\",
        "  -e POSTGRES_PASSWORD=postgres -e POSTGRES_DB=rag_db \\",
        "  -p 5432:5432 -v pgvector_data:/var/lib/postgresql/data \\",
        "  pgvector/pgvector:pg16",
    )
    for hint_line in startup_hint:
        print(hint_line)
    raise
else:
    print(f"✓ Connected to PostgreSQL at {POSTGRES_CONFIG['host']}:{POSTGRES_CONFIG['port']}")

## Create Schema Tables

In [None]:
# Create embedding_registry table (one row per model alias; model_alias is UNIQUE).
# `with conn` commits when the block succeeds and rolls back if it raises, so a
# failed statement does not leave the shared connection stuck in an aborted
# transaction for the cells that follow.
with conn, conn.cursor() as cur:
    cur.execute('''
        CREATE TABLE IF NOT EXISTS embedding_registry (
            id SERIAL PRIMARY KEY,
            model_alias TEXT UNIQUE NOT NULL,
            model_name TEXT NOT NULL,
            dimension INT NOT NULL,
            embedding_count INT DEFAULT 0,
            chunk_source_dataset TEXT,
            chunk_size_config INT,
            metadata_json JSONB DEFAULT '{}'::jsonb,
            created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
            last_accessed TIMESTAMP DEFAULT CURRENT_TIMESTAMP
        )
    ''')
# Reached only after the transaction has been committed.
print("✓ Created embedding_registry table")

In [None]:
# Create evaluation_groundtruth table (questions plus the chunk ids judged relevant).
# source_type and quality_rating are restricted by CHECK constraints; NULL is
# still accepted because neither column is NOT NULL.
# `with conn` commits on success and rolls back on error, keeping the shared
# connection usable if the statement fails.
with conn, conn.cursor() as cur:
    cur.execute('''
        CREATE TABLE IF NOT EXISTS evaluation_groundtruth (
            id SERIAL PRIMARY KEY,
            question TEXT NOT NULL,
            source_type TEXT CHECK (source_type IN ('llm_generated', 'template_based', 'manual')),
            relevant_chunk_ids INT ARRAY,
            quality_rating TEXT CHECK (quality_rating IN ('good', 'bad', 'ambiguous', 'rejected')),
            human_notes TEXT,
            created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
            created_by TEXT
        )
    ''')
# Reached only after the transaction has been committed.
print("✓ Created evaluation_groundtruth table")

In [None]:
# Create experiments table.
# embedding_model_alias references embedding_registry(model_alias), so the
# embedding_registry table must already exist when this cell runs.
# `with conn` commits on success and rolls back on error, keeping the shared
# connection usable if the statement fails (e.g. the referenced table is missing).
with conn, conn.cursor() as cur:
    cur.execute('''
        CREATE TABLE IF NOT EXISTS experiments (
            id SERIAL PRIMARY KEY,
            experiment_name TEXT NOT NULL,
            notebook_path TEXT,
            embedding_model_alias TEXT,
            config_hash TEXT,
            config_json JSONB,
            techniques_applied TEXT ARRAY DEFAULT '{}'::text[],
            started_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
            completed_at TIMESTAMP,
            status TEXT DEFAULT 'running' CHECK (status IN ('running', 'completed', 'failed')),
            notes TEXT,
            FOREIGN KEY (embedding_model_alias) REFERENCES embedding_registry(model_alias)
        )
    ''')
# Reached only after the transaction has been committed.
print("✓ Created experiments table")

In [None]:
# Create evaluation_results table (one row per metric computed for an experiment).
# experiment_id references experiments(id) with ON DELETE CASCADE, so deleting
# an experiment also deletes its results; the experiments table must exist first.
# `with conn` commits on success and rolls back on error, keeping the shared
# connection usable if the statement fails.
with conn, conn.cursor() as cur:
    cur.execute('''
        CREATE TABLE IF NOT EXISTS evaluation_results (
            id SERIAL PRIMARY KEY,
            experiment_id INT NOT NULL,
            metric_name TEXT NOT NULL,
            metric_value FLOAT NOT NULL,
            metric_details_json JSONB DEFAULT '{}'::jsonb,
            computed_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
            FOREIGN KEY (experiment_id) REFERENCES experiments(id) ON DELETE CASCADE
        )
    ''')
# Reached only after the transaction has been committed.
print("✓ Created evaluation_results table")

## Create Indexes for Query Performance

In [None]:
# Create indexes for common queries.
# Every statement uses IF NOT EXISTS, so re-running this cell is harmless.
indexes = [
    "CREATE INDEX IF NOT EXISTS idx_experiments_embedding_model ON experiments(embedding_model_alias)",
    "CREATE INDEX IF NOT EXISTS idx_experiments_status ON experiments(status)",
    "CREATE INDEX IF NOT EXISTS idx_experiments_started ON experiments(started_at DESC)",
    "CREATE INDEX IF NOT EXISTS idx_results_experiment ON evaluation_results(experiment_id)",
    "CREATE INDEX IF NOT EXISTS idx_results_metric ON evaluation_results(metric_name)",
    "CREATE INDEX IF NOT EXISTS idx_groundtruth_quality ON evaluation_groundtruth(quality_rating)",
]

# All indexes are created in one transaction: `with conn` commits once every
# statement has succeeded and rolls back if any of them raises, so the shared
# connection is not left in an aborted transaction.
with conn, conn.cursor() as cur:
    for statement in indexes:
        cur.execute(statement)
# Reached only after the transaction has been committed.
print(f"✓ Created {len(indexes)} indexes")

## Verify Schema Creation

In [None]:
# Verify all tables were created, then close the connection.
# The names returned by the catalog query are compared with the expected set,
# so a missing table is reported instead of being silently omitted.
expected_tables = (
    'embedding_registry',
    'evaluation_groundtruth',
    'experiments',
    'evaluation_results',
)

try:
    with conn.cursor() as cur:
        # psycopg2 adapts a Python tuple to a SQL value list, so `IN %s`
        # expands to IN ('embedding_registry', ...).
        cur.execute('''
            SELECT table_name FROM information_schema.tables
            WHERE table_schema = 'public'
            AND table_name IN %s
            ORDER BY table_name
        ''', (expected_tables,))
        tables = cur.fetchall()
finally:
    # Close the connection even if the verification query fails.
    conn.close()

found_names = {name for (name,) in tables}
missing_tables = [name for name in expected_tables if name not in found_names]

if missing_tables:
    print(f"✗ Schema incomplete. Missing tables: {', '.join(missing_tables)}")
    print("Tables found:")
else:
    print("✓ Schema creation complete. Tables found:")
for (table_name,) in tables:
    print(f"  - {table_name}")

if missing_tables:
    print("\n✗ Database connection closed. Re-run the table creation cells before proceeding.")
else:
    print("\n✓ Database connection closed. Ready to proceed!")