In [20]:
PGVECTOR_CONNECTION_STRING='postgresql://admin:5tgb%25TGB@localhost:45048/famaga'
PGVECTOR_COLLECTION_NAME='details'
TOP_K = 3
SIMILARITY_SEARCH_LIMIT=0.1

In [21]:
import openai
from openai import OpenAI
from dotenv import load_dotenv
import os

load_dotenv()

openai.api_key = os.getenv('OPENAI_API_KEY')

client = OpenAI(api_key = os.getenv('OPENAI_API_KEY'))

EMBEDDINGS_MODEL = 'text-embedding-ada-002'
INDEX_DIMENSIONS = 1536

# print('OpenAI Engines: ', client.Engine.list())

In [14]:
pip install pgvector psycopg2-binary tiktoken

Collecting pgvector
  Obtaining dependency information for pgvector from https://files.pythonhosted.org/packages/ff/70/4121568743eff331240def4d0b0e949f3cd36f440435a69f967ebd1f0bc6/pgvector-0.2.4-py2.py3-none-any.whl.metadata
  Downloading pgvector-0.2.4-py2.py3-none-any.whl.metadata (9.8 kB)
Collecting psycopg2-binary
  Obtaining dependency information for psycopg2-binary from https://files.pythonhosted.org/packages/ce/85/62825cabc6aad53104b7b6d12eb2ad74737d268630032d07b74d4444cb72/psycopg2_binary-2.9.9-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata
  Downloading psycopg2_binary-2.9.9-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.4 kB)
Downloading pgvector-0.2.4-py2.py3-none-any.whl (9.6 kB)
Downloading psycopg2_binary-2.9.9-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.0 MB)
[2K   [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.0/3.0 MB[0m [31m1.1 MB/s[0m eta [36m0:00:00[0mm eta [36m0:00:01[0m[

In [22]:
import json
import psycopg2
import pgvector
from psycopg2.extras import execute_values
from pgvector.psycopg2 import register_vector
import pandas as pd

In [23]:
db_connection = psycopg2.connect(PGVECTOR_CONNECTION_STRING)
db_cursor = db_connection.cursor()
db_connection.autocommit = True

register_vector(db_connection)

In [24]:
db_cursor.execute('CREATE TABLE IF NOT EXISTS detail_brands (brand_id SERIAL PRIMARY KEY, name VARCHAR(200))')

In [24]:
df = pd.read_csv('./assets/brands.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,brand_id,title
0,0,986,Aignep
1,1,4990,GMT
2,2,891,Airtac
3,3,445,BRINKMANN
4,4,17000,Weiss Technik


In [None]:
filtered_df = df[df.apply(lambda row: all(isinstance(row[col], str) for col in ['title', ]), axis=1)]

values_str = ', '.join(filtered_df.apply(
    lambda row: "({brand_id}, '{name}')".format(brand_id=row['brand_id'], name=row['title'].replace("'", "''")), axis=1))
print(values_str)

In [33]:
db_cursor.execute(f"""
INSERT INTO detail_brands (brand_id, name)
VALUES {values_str}
    
""")

In [34]:
db_cursor.execute('CREATE EXTENSION IF NOT EXISTS vector')

In [36]:
table_create_command = f"""
CREATE TABLE IF NOT EXISTS {PGVECTOR_COLLECTION_NAME} (
            brand_id SERIAL PRIMARY KEY,
            name VARCHAR(200),
            metadata JSON,
            embedding VECTOR({INDEX_DIMENSIONS})
            );
            """

db_cursor.execute(table_create_command)

In [25]:
def get_embeddings_vector(text: str):
    res = client.embeddings.create(input = [text], model=EMBEDDINGS_MODEL)
    print(f'Generated embeddings for the string "{text[0:100]}", dimensions: {len(res.data[0].embedding)}')
    return res.data[0].embedding

In [26]:
def save_vector(db_cursor, doc, embeds):
    try:
        query = f"""
            INSERT INTO {PGVECTOR_COLLECTION_NAME} (brand_id, name, embedding)
            VALUES ({doc["brand_id"]}, '{doc["title"]}', '{embeds}')
            ON CONFLICT (brand_id)
            DO
                UPDATE SET name = '{doc["title"]}', embedding = '{embeds}'
        """
        
        db_cursor.execute(query)
        print(f'Vector {doc["brand_id"]} was added to the DB.')
        return doc["title"]
    except Exception as e:
        print(f"[save_vector] execption of type {type(e).__name__}: {e}")

In [None]:
for index, row in df.iterrows():
    embeds = get_embeddings_vector(row["title"])
    save_vector(db_cursor, row, embeds)
    

In [11]:
import concurrent.futures

In [12]:
batch_size = 1000

filtered_df = df[df.apply(lambda row: all(isinstance(row[col], str) for col in ['title', ]), axis=1)]
list_of_dicts = filtered_df.to_dict(orient='records')

batches = [list_of_dicts[i:i + batch_size] for i in range(0, len(list_of_dicts), batch_size)]
len(batches)

17

In [None]:
batches

In [None]:

def process_batch(batch):
    for row in batch:
        try:
            embeds = get_embeddings_vector(row["title"])
            save_vector(db_cursor, row, embeds)
        except Exception as exc:
            print(f'Row {row} generated an exception: {exc}')

# with concurrent.futures.ThreadPoolExecutor() as executor:
#     future_to_batch = {executor.submit(process_batch, batch): batch for batch in batches}
#     for future in concurrent.futures.as_completed(future_to_batch):
#         pass
    
with concurrent.futures.ThreadPoolExecutor() as executor:
    # Submit all batches to the executor
    future_to_batch = {executor.submit(process_batch, batch): batch for batch in batches}

    # Iterate over the futures as they complete
    for future in concurrent.futures.as_completed(future_to_batch):
        batch = future_to_batch[future]
        try:
            result = future.result()
            # You can do something with the result here
            # e.g., print(result)
        except Exception as exc:
            print(f'Batch generated an exception: {exc}')
        else:
            print(f'Batch processed successfully.')

In [27]:
def get_top_relevant_messages(db_cursor, text, embeddings, k=int(TOP_K)):
    try:
        query = f"""
            WITH vector_matches AS (
                SELECT brand_id, name, embedding <=> '{embeddings}' AS distance
                FROM {PGVECTOR_COLLECTION_NAME}
            )
            SELECT brand_id, name, distance
            FROM vector_matches
            ORDER BY distance
            LIMIT '{k}';
        """
        
        db_cursor.execute(query)
        all_matches = db_cursor.fetchall()
        
        relevant_matches = []
        print('All matches:')
        for doc in all_matches:
            print(f'-- {round(doc[2], 2)}: {doc[1]}')
            
            if round(doc[2], 2) <= float(SIMILARITY_SEARCH_LIMIT):
                relevant_matches.append({
                    "document": doc,
                    "score": doc[2]
                    })

        if len(relevant_matches) == 0:
            print("No relevant matches found")
        else:
            print("Relevant matches: ")
            [print(f'-- {round(doc["score"], 2)}: {doc["document"][2]}') for doc in relevant_matches]
        return relevant_matches
    except Exception as e:
        print(f"[get_top_relevant_messages] {type(e).__name__} exception: {e}")
        return []

In [30]:
test_message = 'ephymess'
get_top_relevant_messages(db_cursor, test_message, get_embeddings_vector(test_message))

Generated embeddings for the string "ephymess", dimensions: 1536
All matches:
-- 0.09: EPHY-MESS
-- 0.17: EIFEM
-- 0.17: Clemessy
Relevant matches: 
-- 0.09: 0.08626945369293293


[{'document': (12033, 'EPHY-MESS', 0.08626945369293293),
  'score': 0.08626945369293293}]