In [None]:
import psycopg2
import json
from typing import Optional, Tuple

## Load-or-Generate Function

In [None]:
def load_or_generate(db_connection, embedding_model: str, embedding_alias: str,
                     dataset: Optional[list] = None,
                     target_size_mb: Optional[int] = None,
                     max_chunk_size: int = 1000,
                     chunk_source_dataset: Optional[str] = None,
                     preserve_existing: Optional[bool] = None,
                     embedding_generation_func=None) -> 'PostgreSQLVectorDB':
    """Load embeddings registered under an alias, or generate them if needed.

    The function looks up ``embedding_alias`` in the ``embedding_registry``
    table and then:

    - If an entry exists and loading is chosen, returns a
      ``PostgreSQLVectorDB`` bound to the table
      ``embeddings_<alias with '.' replaced by '_'>``.
    - Otherwise (no entry, or regeneration chosen), delegates to
      ``embedding_generation_func`` and returns its result.

    The registry's dimension, count and ``created_at`` metadata are only
    printed for information; no compatibility check is performed here.

    Args:
        db_connection: PostgreSQL connection. Must provide ``cursor()`` and
            ``get_dsn_parameters()``.
        embedding_model: Model identifier
            (e.g., 'hf.co/CompendiumLabs/bge-base-en-v1.5-gguf').
        embedding_alias: Short alias used as the registry key
            (e.g., 'bge_base_en_v1_5').
        dataset: List of text chunks. Only needed when generating; an empty
            or missing dataset is rejected on the generation path.
        target_size_mb: Dataset size goal. Accepted for interface
            compatibility; not used by this function.
        max_chunk_size: Maximum chunk size in characters, forwarded to the
            generation function.
        chunk_source_dataset: Description of the data source, forwarded to
            the generation function.
        preserve_existing: Applies only when a registry entry exists.
            True loads it, False regenerates, None prompts interactively.
            When no entry exists, generation is always attempted.
        embedding_generation_func: Callable invoked when generation is
            needed. It is called with the keyword arguments ``dataset``,
            ``embedding_model``, ``embedding_alias``, ``db``,
            ``max_chunk_size`` and ``chunk_source_dataset``; its return value
            is returned unchanged.

    Returns:
        A ``PostgreSQLVectorDB`` instance on the load path, otherwise
        whatever ``embedding_generation_func`` returns.

    Raises:
        ValueError: If generation is needed but no generation function or no
            dataset was provided, or if the user cancels at the prompt.
    """
    # Look the alias up in the registry first.
    with db_connection.cursor() as cur:
        cur.execute('''
            SELECT id, dimension, embedding_count, metadata_json
            FROM embedding_registry
            WHERE model_alias = %s
        ''', (embedding_alias,))
        registry_entry = cur.fetchone()

    if registry_entry:
        _reg_id, dimension, embedding_count, metadata = registry_entry

        # The driver only decodes json/jsonb columns automatically; a text
        # column arrives as a raw JSON string, so decode it here instead of
        # failing on ``str.get``.
        if isinstance(metadata, (str, bytes, bytearray)):
            try:
                metadata = json.loads(metadata)
            except ValueError:
                # Malformed metadata must not block loading; it is only
                # used for the informational printout below.
                metadata = None
        if isinstance(metadata, dict):
            created_at = metadata.get('created_at', 'unknown')
        else:
            created_at = 'unknown'

        print(f"\n✓ Found {embedding_count} existing embeddings for '{embedding_alias}'")
        print(f"  Dimension: {dimension}")
        print(f"  Created: {created_at}")

        # Decide whether to load or regenerate.
        should_load = True

        if preserve_existing is False:
            should_load = False
            print("\n⚠️  Will regenerate embeddings (preserve_existing=False)")
        elif preserve_existing is None:
            # Interactive prompt: keep asking until a recognized answer.
            while True:
                response = input(
                    "\nLoad existing embeddings? [(l)oad / (r)egenerate / (c)ancel]: "
                ).lower().strip()
                if response in ('l', 'load'):
                    should_load = True
                    break
                if response in ('r', 'regenerate'):
                    should_load = False
                    break
                if response in ('c', 'cancel'):
                    raise ValueError("User cancelled operation")
                print("Please answer 'l', 'r' or 'c'.")

        if should_load:
            # Table name is derived from the alias; dots are replaced with
            # underscores.
            table_name = f'embeddings_{embedding_alias.replace(".", "_")}'
            return PostgreSQLVectorDB(
                db_connection.get_dsn_parameters(),
                table_name,
                preserve_existing=True  # Already exists
            )

    # Reaching here means: no registry entry, or regeneration was chosen.
    if not embedding_generation_func:
        if registry_entry:
            # Embeddings do exist here, so say what actually went wrong.
            raise ValueError(
                f"Regeneration requested for '{embedding_alias}' but "
                "no generation function provided. "
                "Pass embedding_generation_func parameter."
            )
        raise ValueError(
            f"No existing embeddings for '{embedding_alias}' and "
            "no generation function provided. "
            "Pass embedding_generation_func parameter."
        )

    if not dataset:
        raise ValueError(
            "Dataset required for generating embeddings. "
            "Pass the dataset parameter."
        )

    print(f"\nGenerating new embeddings for '{embedding_alias}'...")
    print(f"Dataset: {len(dataset)} chunks")

    # Delegate to the caller-supplied generation function.
    return embedding_generation_func(
        dataset=dataset,
        embedding_model=embedding_model,
        embedding_alias=embedding_alias,
        db=db_connection,
        max_chunk_size=max_chunk_size,
        chunk_source_dataset=chunk_source_dataset
    )

## Usage Example in Advanced Notebook

```python
# At top of your advanced technique notebook:

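# A module name that starts with a digit cannot be used in a
# `from ... import` statement (SyntaxError), so load it by name instead.
import importlib

_registry_utils = importlib.import_module(
    'foundation.00_registry_and_tracking_utilities'
)
list_available_embeddings = _registry_utils.list_available_embeddings
load_or_generate = _registry_utils.load_or_generate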

# Show available embeddings
available = list_available_embeddings(db)
print(available)

# Define your embedding generation function
def generate_embeddings(dataset, embedding_model, embedding_alias, db, 
                        max_chunk_size, chunk_source_dataset):
    """Your existing embedding generation logic."""
    # ... implementation ...
    register_embedding(db, embedding_alias, ...)  # Register after generating
    return PostgreSQLVectorDB(...)  # Return DB instance

# Use load_or_generate for flexibility
db = load_or_generate(
    db_connection=db,
    embedding_model=EMBEDDING_MODEL,
    embedding_alias=EMBEDDING_MODEL_ALIAS,
    dataset=dataset if regenerate else None,  # Only pass if regenerating
    chunk_source_dataset=f'Wikipedia {TARGET_SIZE_MB}MB',
    preserve_existing=True,  # Reuse if exists
    embedding_generation_func=generate_embeddings
)

# Now db is ready, with minimal overhead if embeddings already exist
```