In [1]:
%pip install psycopg2 numpy sentence-transformers

Collecting psycopg2
  Downloading psycopg2-2.9.10-cp313-cp313-win_amd64.whl.metadata (4.8 kB)
Collecting sentence-transformers
  Downloading sentence_transformers-5.0.0-py3-none-any.whl.metadata (16 kB)
Collecting scikit-learn (from sentence-transformers)
  Downloading scikit_learn-1.7.1-cp313-cp313-win_amd64.whl.metadata (11 kB)
Collecting scipy (from sentence-transformers)
  Downloading scipy-1.16.1-cp313-cp313-win_amd64.whl.metadata (60 kB)
Collecting threadpoolctl>=3.1.0 (from scikit-learn->sentence-transformers)
  Downloading threadpoolctl-3.6.0-py3-none-any.whl.metadata (13 kB)
Downloading psycopg2-2.9.10-cp313-cp313-win_amd64.whl (2.6 MB)
   ---------------------------------------- 0.0/2.6 MB ? eta -:--:--
   ---------------------------------------- 0.0/2.6 MB ? eta -:--:--
   ---------------------------------------- 0.0/2.6 MB ? eta -:--:--
   ---- ----------------------------------- 0.3/2.6 MB ? eta -:--:--
   -------- ------------------------------- 0.5/2.6 MB 775.9 kB/s eta 

In [12]:
import psycopg2
from psycopg2.extras import execute_values, Json
import numpy as np
from typing import List, Tuple
from sentence_transformers import SentenceTransformer

In [None]:
# Import necessary libraries for the class
from contextlib import contextmanager
from typing import Dict, List, Optional, Tuple

import psycopg2
from psycopg2.extras import Json, execute_values
from sentence_transformers import SentenceTransformer

# PostgresVectorStore Class Definition
class PostgresVectorStore:
    """
    A vector store implementation using PostgreSQL with the pgvector extension
    for storing and searching text embeddings.

    Texts are embedded with a SentenceTransformer model and kept in the
    ``document_embeddings`` table (``content``, ``embedding``, ``metadata``).
    Every database operation opens its own short-lived connection, which is
    always closed again before the method returns.
    """

    def __init__(
        self,
        connection_string: str,
        model_name: str = "all-MiniLM-L6-v2",
        reset_table: bool = True,
    ):
        """
        Initialize the vector store with a PostgreSQL connection string
        and a SentenceTransformer model.

        Args:
            connection_string: PostgreSQL connection string (e.g., "postgresql://user:pass@host:port/dbname")
            model_name: Name of the sentence-transformers model to use for embeddings.
                        Defaults to "all-MiniLM-L6-v2".
            reset_table: When True (the default, matching the historical behaviour),
                         any existing ``document_embeddings`` table is dropped and
                         recreated, DELETING previously stored documents. Pass False
                         to keep existing rows and only create the table if missing.

        Raises:
            Exception: Whatever the model loader or the database setup raises;
                       errors are printed and then re-raised unchanged.
        """
        self.conn_string = connection_string
        print(f"Loading SentenceTransformer model: {model_name}...")
        try:
            # Initialize the SentenceTransformer model for generating embeddings
            self.model = SentenceTransformer(model_name)
            # The model's output size determines the VECTOR column dimension
            self.embedding_dimension = self.model.get_sentence_embedding_dimension()
            print(f"Initialized embedding model: {model_name} with dimension {self.embedding_dimension}")
        except Exception as e:
            print(f"Error loading SentenceTransformer model: {e}")
            print("Please check your internet connection or model name.")
            raise

        # Ensure the extension and table are set up
        self._setup_database(reset_table=reset_table)

    @contextmanager
    def _cursor(self):
        """
        Yield a cursor on a fresh connection, inside a single transaction.

        The transaction is committed when the block finishes normally and
        rolled back when it raises. The connection is closed in both cases.
        """
        conn = psycopg2.connect(self.conn_string)
        try:
            # `with conn` only scopes the transaction (commit/rollback); it does
            # NOT close the connection, hence the explicit close() below.
            with conn:
                with conn.cursor() as cur:
                    yield cur
        finally:
            conn.close()

    def _setup_database(self, reset_table: bool = True):
        """
        Internal method to enable the 'vector' extension and make sure the
        'document_embeddings' table exists.

        Args:
            reset_table: When True, drop the table first so it starts empty
                         (all stored documents are lost). When False, keep an
                         existing table as it is; note that an existing table
                         created for a different embedding dimension is NOT
                         altered.

        Raises:
            psycopg2.OperationalError: If the database cannot be reached.
            Exception: Any other setup failure; printed, then re-raised.
        """
        try:
            # A type modifier cannot be sent as a bind parameter, so the value
            # is interpolated; int() guarantees only a number reaches the SQL.
            dimension = int(self.embedding_dimension)
            with self._cursor() as cur:
                # Enable the pgvector extension
                cur.execute("CREATE EXTENSION IF NOT EXISTS vector;")

                if reset_table:
                    # Start from a clean slate
                    cur.execute("DROP TABLE IF EXISTS document_embeddings;")

                # The VECTOR column dimension must match the model's output
                cur.execute(f"""
                    CREATE TABLE IF NOT EXISTS document_embeddings (
                        id SERIAL PRIMARY KEY,
                        content TEXT NOT NULL,
                        embedding VECTOR({dimension}) NOT NULL,
                        metadata JSONB
                    );
                """)
            print("Database setup (extension and table) complete.")
        except psycopg2.OperationalError as e:
            print(f"Database connection error during setup: {e}")
            print("Please ensure PostgreSQL is running and connection string is correct.")
            raise
        except Exception as e:
            print(f"Error during database setup: {e}")
            raise

    def add_texts(self, texts: List[str], metadatas: Optional[List[Dict]] = None):
        """
        Add texts and their associated metadata to the vector store.
        Texts are first converted into embeddings using the initialized model.

        Args:
            texts: A list of text strings to embed and store.
            metadatas: An optional list of metadata dictionaries,
                       corresponding to each text. If None, empty dicts are used.

        Raises:
            ValueError: If ``texts`` and ``metadatas`` differ in length.
            psycopg2.Error: If the insert fails; printed, then re-raised.
            Exception: Any embedding or other failure; printed, then re-raised.
        """
        if metadatas is None:
            # If no metadata is provided, create an empty dictionary for each text
            metadatas = [{} for _ in texts]

        # Ensure texts and metadatas have the same length
        if len(texts) != len(metadatas):
            raise ValueError("Lengths of texts and metadatas must match.")

        if not texts:
            # Nothing to embed or insert; avoid a pointless model call and connection
            print("No texts provided; nothing to add.")
            return

        print(f"Generating embeddings for {len(texts)} texts...")
        try:
            # Encode all texts in one batch. .tolist() turns the result into
            # plain Python lists so each embedding can be passed as a query
            # parameter.
            embeddings = self.model.encode(texts).tolist()
            print("Embeddings generated.")
        except Exception as e:
            print(f"Error generating embeddings: {e}")
            print("Please check the input 'texts' or the embedding model.")
            raise

        try:
            # One row per document; Json(...) stores the metadata dict as JSONB
            rows = [
                (text, embedding, Json(metadata))
                for text, embedding, metadata in zip(texts, embeddings, metadatas)
            ]
            with self._cursor() as cur:
                # execute_values expands the single %s into all rows
                execute_values(cur, """
                    INSERT INTO document_embeddings (content, embedding, metadata)
                    VALUES %s
                """, rows)
            print(f"Successfully added {len(texts)} documents to the vector store.")
        except psycopg2.Error as e:
            print(f"Database error adding documents: {e}")
            print("Please check your table schema (especially 'embedding' dimension) and data types.")
            raise
        except Exception as e:
            print(f"An unexpected error occurred while adding documents: {e}")
            raise

    def count_documents(self) -> int:
        """
        Counts the number of documents currently stored in the vector store.

        Returns:
            The total number of rows in the document_embeddings table, or -1
            if the count could not be obtained (the error is printed, not raised).
        """
        try:
            with self._cursor() as cur:
                cur.execute("SELECT COUNT(*) FROM document_embeddings;")
                return cur.fetchone()[0]
        except psycopg2.Error as e:
            print(f"Database error counting documents: {e}")
            return -1  # Indicate an error
        except Exception as e:
            print(f"An unexpected error occurred while counting documents: {e}")
            return -1

    def similarity_search(
        self,
        query: str,
        limit: int = 5
    ) -> List[Tuple[str, float, Dict]]:
        """
        Perform a similarity search against the stored embeddings using cosine similarity.

        Args:
            query: The text query string to search for.
            limit: The maximum number of similar results to return.

        Returns:
            A list of tuples, where each tuple contains:
            (content_string, similarity_score, metadata_dictionary)
            Results are ordered by similarity (highest first). An empty list is
            returned when the database query fails (the error is printed).

        Raises:
            Exception: If the query embedding cannot be generated; printed,
                       then re-raised.
        """
        print(f"Generating embedding for query: '{query}'...")
        try:
            # Generate embedding for the query text
            query_embedding = self.model.encode(query).tolist()
            print("Query embedding generated.")
        except Exception as e:
            print(f"Error generating query embedding: {e}")
            print("Please check the input 'query' or the embedding model.")
            raise

        try:
            with self._cursor() as cur:
                # <=> is the cosine-distance operator, so 1 - distance = similarity.
                # The explicit ::vector cast turns the list parameter into a vector.
                # Named parameters let the query vector be supplied once.
                cur.execute("""
                    SELECT
                        content,
                        1 - (embedding <=> %(query_vec)s::vector) as similarity,
                        metadata
                    FROM document_embeddings
                    ORDER BY embedding <=> %(query_vec)s::vector
                    LIMIT %(limit)s
                """, {"query_vec": query_embedding, "limit": limit})

                results = cur.fetchall()
                print(f"Found {len(results)} results.")
                return results
        except psycopg2.Error as e:
            print(f"Database error during similarity search: {e}")
            print("Please ensure pgvector extension is enabled and 'embedding' column is of type VECTOR.")
            return []  # Return an empty list on database error
        except Exception as e:
            print(f"An unexpected error occurred during similarity search: {e}")
            return []


In [None]:
import time

if __name__ == "__main__":
    # Demo: build the store, insert five sample documents, verify the row
    # count, then run one similarity search and print the matches.
    import os

    # IMPORTANT: Ensure your PostgreSQL container with pgvector is running and accessible.
    # The connection string can be supplied through the VECTORDB_URL environment
    # variable so real credentials never have to be written into the notebook;
    # the fallback matches the local Docker demo setup (user, password, host,
    # port and dbname must agree with that container).
    connection_string = os.environ.get(
        "VECTORDB_URL",
        "postgresql://vectordb:vectorpass@localhost:5432/vectordb",
    )

    print("Initializing vector store...")
    try:
        store = PostgresVectorStore(connection_string)
        print("Vector store initialized successfully.")
    except Exception as e:
        print(f"Failed to initialize vector store: {e}")
        print("Please ensure your PostgreSQL database with pgvector is running and accessible.")
        print("Check your Docker container status (`docker ps`) and logs (`docker logs <container_id>`).")
        # Stop here: every later step needs `store`. SystemExit is raised
        # explicitly because exit() is an interactive-shell helper and is not
        # a reliable way to halt a program or a notebook cell.
        raise SystemExit(1) from e

    # Example texts and metadata to add to the vector store
    texts = [
        "The quick brown fox jumps over the lazy dog",
        "Machine learning is a subset of artificial intelligence",
        "Python is a versatile programming language",
        "Natural language processing helps computers understand human language",
        "Vector databases are optimized for similarity search"
    ]

    metadatas = [
        {"source": "sample1", "category": "pangram"},
        {"source": "sample2", "category": "technology"},
        {"source": "sample3", "category": "programming"},
        {"source": "sample4", "category": "nlp"},
        {"source": "sample5", "category": "databases"}
    ]

    # Add documents to the vector store
    print(f"\nAttempting to add {len(texts)} documents...")
    try:
        store.add_texts(texts, metadatas)
        print("Documents added successfully.")
    except Exception as e:
        print(f"Error adding documents: {e}")
        print("This could be due to database connection, table schema (especially 'embedding' dimension), or data types.")
        # Execution deliberately continues: the count and search below then
        # report the resulting (empty) state instead of aborting the demo.

    # Verify document count (count_documents returns -1 on error)
    doc_count = store.count_documents()
    if doc_count > 0:
        print(f"Confirmed {doc_count} documents in the database.")
    else:
        print(f"No documents found in the database ({doc_count} documents). This is why search might fail.")
        print("Please ensure `add_texts` completed without errors and data was committed.")
        # The search still runs; its "No results found" branch explains the outcome.

    # Perform similarity search
    query = "Tell me about AI and machine learning"
    print(f"\nPerforming similarity search for query: '{query}'")

    # perf_counter is a monotonic, high-resolution clock meant for measuring
    # elapsed time; wall-clock time can jump if the system clock is adjusted.
    start_time = time.perf_counter()
    results = store.similarity_search(query, limit=2)
    end_time = time.perf_counter()

    print(f"Search completed in {end_time - start_time:.2f} seconds.")

    # Print search results
    if results:
        print("\n--- Search Results ---")
        for content, similarity, metadata in results:
            print(f"Content: {content}")
            print(f"Similarity: {similarity:.4f}")
            print(f"Metadata: {metadata}")
            print("---")
    else:
        print("\nNo results found. This might happen if:")
        print("- No documents were successfully added to the database (check count above).")
        print("- The search query did not yield relevant results (e.g., very low similarity).")
        print("- There was an error during the search operation (check previous error messages).")


Initializing vector store...
Loading SentenceTransformer model: all-MiniLM-L6-v2...
Initialized embedding model: all-MiniLM-L6-v2 with dimension 384
Database setup (extension and table) complete.
Vector store initialized successfully.

Attempting to add 5 documents...
Generating embeddings for 5 texts...
Embeddings generated.
Successfully added 5 documents to the vector store.
Documents added successfully.
Confirmed 5 documents in the database.

Performing similarity search for query: 'Tell me about AI and machine learning'
Generating embedding for query: 'Tell me about AI and machine learning'...
Query embedding generated.
Found 2 results.
Search completed in 0.30 seconds.

--- Search Results ---
Content: Machine learning is a subset of artificial intelligence
Similarity: 0.7109
Metadata: {'source': 'sample2', 'category': 'technology'}
---
Content: Natural language processing helps computers understand human language
Similarity: 0.3885
Metadata: {'source': 'sample4', 'category': 'nlp'