In [None]:
# https://www.timescale.com/blog/combining-semantic-search-and-full-text-search-in-postgresql-with-cohere-pgvector-and-pgai

In [None]:
-- Create a table with a vector column
CREATE TABLE vectors (
    id SERIAL PRIMARY KEY,
    vector vector(1536)
);

-- Insert some data
INSERT INTO vectors (vector) VALUES
    (vector(1.0, 2.0, ..., 1536.0)),
    (vector(1.0, 2.0, ..., 1536.0)),
    (vector(1.0, 2.0, ..., 1536.0));

-- Query the table using the vector column
SELECT * FROM vectors WHERE vector <-> vector(0.0, ..., 0.0) < 10;

In [None]:
# https://www.datacamp.com/tutorial/pgvector-tutorial

In [None]:
create table documents (
  id bigserial primary key,
  content text,
  embedding vector(1536)
);

create or replace function match_documents (
  query_embedding vector(1536),
  match_threshold float,
  match_count int
)
returns table (
  id bigint,
  content text,
  similarity float
)
language sql stable
as $$
  select
    documents.id,
    documents.content,
    1 - (documents.embedding <=> query_embedding) as similarity
  from documents
  where documents.embedding <=> query_embedding < 1 - match_threshold
  order by documents.embedding <=> query_embedding
  limit match_count;
$$;


In [3]:
import os
from ast import literal_eval
import pandas as pd
from dotenv import find_dotenv, load_dotenv
from openai import OpenAI
import openai
import psycopg2

In [5]:
load_dotenv(find_dotenv())

True

In [6]:
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
# Set up your OpenAI API key
# openai.api_key = OPENAI_API_KEY
client = OpenAI()
# Choose a model
model = "text-embedding-ada-002"
print(OPENAI_API_KEY[:12])

sk-proj-_YC6


In [8]:
print("Connecting to PostgreSQL...")
conn = psycopg2.connect(
    database="postgres",
    user="postgres",
    password="postgres",
    host="host.docker.internal",
)

cursor = conn.cursor()

print("Successfully connected to PostgreSQL.")

Connecting to PostgreSQL...
Successfully connected to PostgreSQL.


In [10]:
# Converting the prompt to the pgvector embedding
def get_embedding(prompt):
    response = openai.embeddings.create(input=prompt, model="text-embedding-ada-002")

    embedding = response.data[0].embedding

    # Converting the embedding to the pgvector and returning it
    return "[" + ",".join(map(str, embedding)) + "]"


# Getting the matching threshold for the similarity search
def get_matching_threshold():
    return 0.7


# Getting the number of matching movies to return
def get_matching_count():
    return 3

Second, find the most relevant movies for a provided user prompt by calculating the cosine distance (`<=>`) between the prompt's and movies' embeddings:


In [9]:
user_prompt = "A movie about an fraternity."

prompt_vector = get_embedding(user_prompt)

cursor.execute(
    "SELECT title, overview "
    "FROM movie WHERE 1 - (overview_vector <=> %(prompt_vector)s) >= %(match_threshold)s "
    "ORDER BY overview_vector <=> %(prompt_vector)s LIMIT %(match_cnt)s",
    {
        "prompt_vector": prompt_vector,
        "match_threshold": get_matching_threshold(),
        "match_cnt": get_matching_count(),
    },
)

result = cursor.fetchall()

for row in result:
    print(row)

('Fraternity Row', "Director Thomas J. Tobin's 1977 drama about college freshmen subjected to fraternity hazing stars Gregory Harrison, Peter Fox, Scott Newman, Nancy Morgan and Wendy Phillips.")
('Fraternity Row', "Director Thomas J. Tobin's 1977 drama about college freshmen subjected to fraternity hazing stars Gregory Harrison, Peter Fox, Scott Newman, Nancy Morgan and Wendy Phillips.")
('Lee Rock', 'The film chronicles the rise and fall of a corrupt police force that Lee Rock becomes a part of.')


## Pre-Filter Data Before Similarity Search

As a general-purpose relational database, PostgreSQL allows you to pre-filter data before a vector search is started. You can pre-filter by specifying a condition on non-vector columns in the `WHERE` clause of a query statement.

For instance, imagine the user selecting the `Science Fiction` category and asking to suggest movies with a rating of `7` or higher. Then, the user prompts for `A movie about a space adventure`. The final SQL query can look as follows:


In [7]:
user_prompt = "A movie about a space adventure."

prompt_vector = get_embedding(user_prompt)

cursor.execute(
    "SELECT title, vote_average, genres "
    "FROM movie WHERE vote_average >= 7 "
    'AND genres @> \'[{"name": "Science Fiction"}]\' '
    "AND 1 - (overview_vector <=> %(prompt_vector)s) >= %(match_threshold)s "
    "ORDER BY overview_vector <=> %(prompt_vector)s LIMIT %(match_cnt)s",
    {
        "prompt_vector": prompt_vector,
        "match_threshold": get_matching_threshold(),
        "match_cnt": get_matching_count(),
    },
)

result = cursor.fetchall()

for row in result:
    print(row)