In [None]:
import psycopg2
import time
import statistics
from datasets import load_dataset
from config import load_config

def create_table():
    """Create the ``bookcorpus`` table if it does not already exist.

    The table has an auto-generated integer primary key (``sentence_id``)
    and a non-null ``sentence`` text column.

    Any error raised while loading the configuration, connecting, or
    executing the statement is printed rather than propagated, so the
    caller keeps running. The success message is printed only when the
    statement was executed and committed.
    """
    cmd = """ CREATE TABLE IF NOT EXISTS bookcorpus (
        sentence_id INTEGER PRIMARY KEY GENERATED ALWAYS AS IDENTITY,
        sentence TEXT NOT NULL
        )"""
    conn = None
    try:
        config = load_config()
        conn = psycopg2.connect(**config)
        # ``with conn`` commits on success and rolls back on error, but it
        # does NOT close the connection; that happens in ``finally`` below.
        with conn:
            with conn.cursor() as cur:
                cur.execute(cmd)
    except Exception as error:
        print(error)
    else:
        # Report success only when nothing went wrong.
        print("Table bookcorpus created.")
    finally:
        if conn is not None:
            conn.close()

def insert_sentences(ds_chunk):
    """Insert the ``"text"`` field of every example in *ds_chunk* into ``bookcorpus``.

    Args:
        ds_chunk: Iterable of mapping-like examples, each exposing a
            ``"text"`` key.

    Database and configuration errors are printed rather than raised, so a
    failing chunk does not abort the caller's loop. An empty chunk is a
    no-op and opens no connection.
    """
    sql = """INSERT INTO bookcorpus (sentence) VALUES (%s)"""
    # One single-element tuple per row, as expected by executemany().
    text_chunks = [(example["text"],) for example in ds_chunk]
    if not text_chunks:
        return

    conn = None
    try:
        config = load_config()
        conn = psycopg2.connect(**config)
        # The connection context manager commits on success and rolls back
        # on error, so no explicit commit is required here.
        with conn:
            with conn.cursor() as cur:
                cur.executemany(sql, text_chunks)
    except Exception as error:
        print(error)
    finally:
        # ``with conn`` leaves the connection open; close it explicitly so
        # that calling this function once per chunk does not leak sessions.
        if conn is not None:
            conn.close()

if __name__ == '__main__':
    # Load the dataset into the bookcorpus table in fixed-size chunks and
    # report how long each chunk took to insert.
    create_table()
    times = []
    ds = load_dataset("williamkgao/bookcorpus100mb", split="train")
    chunk_size = 10000
    total_rows = len(ds)
    for i in range(0, total_rows, chunk_size):
        # perf_counter() is monotonic, so the measured interval cannot be
        # distorted by system clock adjustments.
        start_time = time.perf_counter()
        chunk = ds.select(range(i, min(i + chunk_size, total_rows)))
        insert_sentences(chunk)
        times.append(time.perf_counter() - start_time)

    print("Finished storing textual data.")
    if times:
        min_time = min(times)
        max_time = max(times)
        avg_time = sum(times) / len(times)
        # stdev() needs at least two samples.
        std_dev = statistics.stdev(times) if len(times) > 1 else 0
        print(f"Minimum time: {min_time:.6f} seconds")
        print(f"Maximum time: {max_time:.6f} seconds")
        print(f"Average time: {avg_time:.6f} seconds")
        print(f"Standard deviation: {std_dev:.6f} seconds")
    else:
        # An empty dataset yields no chunks; min()/max() would raise.
        print("No chunks were processed; no timing statistics to report.")
    

In [16]:
import psycopg2
import time
import statistics
from sentence_transformers import SentenceTransformer
from config import load_config

# Sentence-embedding model, created once at module level and used by
# querys_to_embeddings() to encode every batch of sentences.
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
# Per-batch insert durations in seconds: appended to by
# querys_to_embeddings() and summarized by perfomance().
times = []

def create_embeddings_table():
    """Create the ``embeddings`` table if it does not already exist.

    The table has an auto-generated integer primary key (``id``) and a
    ``vec`` column of type ``double precision[]``.

    Any error raised while loading the configuration, connecting, or
    executing the statement is printed rather than propagated. The success
    message is printed only when the statement was executed and committed.
    """
    cmd = """ CREATE TABLE IF NOT EXISTS embeddings (
        id INTEGER PRIMARY KEY GENERATED ALWAYS AS IDENTITY,
        vec double precision[]
        )"""
    conn = None
    try:
        config = load_config()
        conn = psycopg2.connect(**config)
        # ``with conn`` commits on success and rolls back on error, but it
        # does NOT close the connection; that happens in ``finally`` below.
        with conn:
            with conn.cursor() as cur:
                cur.execute(cmd)
    except Exception as error:
        print(error)
    else:
        # Previously printed from ``finally``, i.e. even after a failure.
        print("Table embeddings created.")
    finally:
        if conn is not None:
            conn.close()

def querys_to_embeddings():
    """Encode every ``bookcorpus`` sentence and store the vectors in ``embeddings``.

    Sentences are read in ``sentence_id`` order in batches of 10000,
    encoded with the module-level ``model``, and inserted through a second
    connection that commits after every batch. The duration of each
    insert-and-commit step (encoding time excluded) is appended to the
    module-level ``times`` list.

    Errors are printed rather than raised. The final message reports the
    number of embeddings whose batch was committed.
    """
    chunk_size = 10000
    sql = """SELECT sentence FROM bookcorpus ORDER BY sentence_id ASC;"""
    insert_sql = """INSERT INTO embeddings (vec) VALUES (%s);"""
    count = 0
    read_conn = None
    write_conn = None
    try:
        config = load_config()
        read_conn = psycopg2.connect(**config)
        write_conn = psycopg2.connect(**config)
        with read_conn, write_conn:
            # A named cursor is a server-side cursor: rows are streamed one
            # fetchmany() batch at a time instead of the entire result set
            # being loaded into client memory by execute(). It needs an open
            # transaction, so the read connection is not put in autocommit.
            with read_conn.cursor(name="bookcorpus_reader") as read_cursor:
                with write_conn.cursor() as write_cursor:
                    read_cursor.execute(sql)
                    while True:
                        rows = read_cursor.fetchmany(chunk_size)
                        if not rows:
                            break
                        sentences = [row[0] for row in rows]
                        embeddings = model.encode(sentences)
                        emb_to_insert = [(embedding.tolist(),) for embedding in embeddings]
                        # Time only the database write, not the encoding.
                        start_time = time.perf_counter()
                        write_cursor.executemany(insert_sql, emb_to_insert)
                        write_conn.commit()
                        times.append(time.perf_counter() - start_time)
                        # Count a batch only once it is durably stored.
                        count += len(emb_to_insert)
    except Exception as error:
        print(error)
    finally:
        # ``with conn`` only ends the transaction; close both connections
        # explicitly so they are not leaked.
        for conn in (read_conn, write_conn):
            if conn is not None:
                conn.close()

    print(f"Finished storing the {count} embeddings.")

def perfomance(samples=None):
    """Print min, max, mean and standard deviation of recorded durations.

    Args:
        samples: Optional sequence of durations in seconds. When omitted,
            the module-level ``times`` list is summarized.

    When there are no samples a short notice is printed instead of the
    statistics, because ``min()``/``max()`` and the mean are undefined for
    an empty sequence. With a single sample the standard deviation is
    reported as 0.
    """
    data = times if samples is None else samples
    if not data:
        print("No timing samples were recorded.")
        return

    min_time = min(data)
    max_time = max(data)
    avg_time = sum(data) / len(data)
    # stdev() needs at least two samples.
    std_dev = statistics.stdev(data) if len(data) > 1 else 0
    print(f"Minimum time: {min_time:.6f} seconds")
    print(f"Maximum time: {max_time:.6f} seconds")
    print(f"Average time: {avg_time:.6f} seconds")
    print(f"Standard deviation: {std_dev:.6f} seconds")

if __name__ == '__main__':
    # Pipeline stages, executed strictly in this order: make sure the
    # target table exists, fill it with embeddings, then report timings.
    for stage in (create_embeddings_table, querys_to_embeddings, perfomance):
        stage()


Table embeddings created.
Finished storing the 1527753 embeddings.
Minimum time: 8.627728 seconds
Maximum time: 11.970323 seconds
Average time: 11.211960 seconds
Standard deviation: 0.323633 seconds
