# Database Testing

Sample functionality for creating tables, inserting data and running similarity search with OgbujiPT.

Notes:
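- If the notebook will not start, install Jupyter first: `pip install jupyter`.
- The code cells import `sentence_transformers` and `ogbujipt`, and connect to a PostgreSQL database, so those must be available as well.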

In [1]:
import os

from sentence_transformers import SentenceTransformer
from ogbujipt.embedding_helper import PGvectorConnection

# Name of the SentenceTransformer model used to embed text
EMBEDDING_MODEL_NAME = 'all-mpnet-base-v2'

# Instantiate the embedding model; it is handed to PGvectorConnection.create
# as `embedding_model` when connecting to the database
e_model = SentenceTransformer(EMBEDDING_MODEL_NAME)

# Demo sentences used as the sample data for this notebook
pacer_copypasta = [
    'The FitnessGram™ Pacer Test is a multistage aerobic capacity test that progressively gets more difficult as it continues.',
    'The 20 meter pacer test will begin in 30 seconds. Line up at the start.',
    'The running speed starts slowly, but gets faster each minute after you hear this signal.',
    '[beep] A single lap should be completed each time you hear this sound.',
    '[ding] Remember to run in a straight line, and run as long as possible.',
    'The second time you fail to complete a lap before the sound, your test is over.',
    'The test will begin on the word start. On your mark, get ready, start.',
]

## Connecting to the database

In [2]:
try:
    print("Connecting to database...")
    vDB = await PGvectorConnection.create(
        embedding_model=e_model, 
        user='oori', 
        password='example', 
        db_name='PGv', 
        host='sofola', 
        port=int('5432')
        )
    print("Connected to database.")
except Exception as e:
    print(f"Error connecting to database: {e}")

Connecting to database...
Connected to database.


## Setting up database

In [4]:
# Ensuring that the vector extension is installed
try:
    await vDB.conn.execute('''CREATE EXTENSION IF NOT EXISTS vector;''')
    print("PGvector extension created and loaded.")
except Exception as e:
    print(f"Error creating extension: {e}")

# XXX Dropping the table if it exists (for testing purposes, ideally we would want to update the table)
try:
    await vDB.conn.execute('''DROP TABLE IF EXISTS embeddings;''')
    print("Table dropped.")
except Exception as e:
    print(f"Error dropping table: {e}")

# Creating a new table
try:
    await vDB.create_doc_table(table_name='embeddings')
    print("Table created.")
except Exception as e:
    print(f"Error creating table: {e}")

PGvector extension created and loaded.
Table dropped.
Table created.


## Inserting data

In [5]:
# Ensuring that the vector extension is installed
try:
    await vDB.conn.execute('''CREATE EXTENSION IF NOT EXISTS vector;''')
    print("PGvector extension created and loaded.")
except Exception as e:
    print(f"Error creating extension: {e}")

# XXX Dropping the table if it exists (for testing purposes, ideally we would want to update the table)
try:
    await vDB.conn.execute('''DROP TABLE IF EXISTS embeddings;''')
    print("Table dropped.")
except Exception as e:
    print(f"Error dropping table: {e}")

# Creating a new table
try:
    await vDB.create_doc_table(table_name='embeddings')
    print("Table created.")
except Exception as e:
    print(f"Error creating table: {e}")

Inserting data into table...
Data inserted.


## Similarity search

In [6]:
# Setting K for the search: this value is passed as `k` to each
# `search_doc_table` call in the cells below
k = 3

In [7]:
# Searching the table with a perfect match
search_string = '[beep] A single lap should be completed each time you hear this sound.'
print('Semantic Searching data...')
print('using search string:', search_string)

try:
    sim_search = await vDB.search_doc_table(table_name='embeddings', query_string=search_string, k=k)
except Exception as e:
    print(f"Error searching table: {e}")

print('RETURNED Title:', sim_search[0]['title'])
print('RETURNED Content:', sim_search[0]['content'])
print('RETURNED Cosine Similarity:', f'{sim_search[0]["cosine_similarity"]:.2f}')
print('RAW RETURN:', sim_search)

Semantic Searching data...
using search string: [beep] A single lap should be completed each time you hear this sound.
RETURNED Title: Pacer Copypasta line 3
RETURNED Content: [beep] A single lap should be completed each time you hear this sound.
RETURNED Cosine Similarity: 1.00
RAW RETURN: [<Record cosine_similarity=1.0 title='Pacer Copypasta line 3' content='[beep] A single lap should be completed each time you hear this sound.'>, <Record cosine_similarity=0.6023784459346072 title='Pacer Copypasta line 5' content='The second time you fail to complete a lap before the sound, your test is over.'>, <Record cosine_similarity=0.5462966290889161 title='Pacer Copypasta line 4' content='[ding] Remember to run in a straight line, and run as long as possible.'>]


In [8]:
# Searching the table with a partial match
search_string = 'Straight'
print('Semantic Searching data...')
print('using search string:', search_string)
try:
    sim_search = await vDB.search_doc_table(table_name='embeddings', query_string=search_string, k=k)
except Exception as e:
    print(f"Error searching table: {e}")
print('RETURNED Title:', sim_search[0]['title'])
print('RETURNED Content:', sim_search[0]['content'])
print('RETURNED Cosine Similarity:', f'{sim_search[0]["cosine_similarity"]:.2f}')
print('RAW RETURN:', sim_search)

Semantic Searching data...
using search string: Straight
RETURNED Title: Pacer Copypasta line 4
RETURNED Content: [ding] Remember to run in a straight line, and run as long as possible.
RETURNED Cosine Similarity: 0.19
RAW RETURN: [<Record cosine_similarity=0.19225941254254653 title='Pacer Copypasta line 4' content='[ding] Remember to run in a straight line, and run as long as possible.'>, <Record cosine_similarity=0.05218006236434358 title='Pacer Copypasta line 6' content='The test will begin on the word start. On your mark, get ready, start.'>, <Record cosine_similarity=0.026942850765159787 title='Pacer Copypasta line 3' content='[beep] A single lap should be completed each time you hear this sound.'>]
