# One Million Documents: Benchmark

In [1]:
import svs
import pandas as pd
import os
import random
import string
import numpy as np

In [2]:
import logging

logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
)

In [3]:
N = 1_000_000

# Step 1: Create Synthetic Data

In [4]:
async def fake_embeddings(list_of_strings):
    n = len(list_of_strings)
    m = 1536   # <-- same as OpenAI's `text-embedding-3-small` embedding dimensionality
    v = np.random.random(size=(n, m))
    mags = np.sqrt((v * v).sum(axis=1)).reshape(n, 1)
    return (v / mags).tolist()

In [5]:
%%time

letters = list(string.ascii_letters)

def rand_word():
    n = random.randint(1, 5)
    return ''.join(random.choice(letters) for _ in range(n))

def rand_document():
    n = random.randint(100, 500)
    return ' '.join(rand_word() for _ in range(n))

rand_docs = [rand_document() for _ in range(N)]

len(rand_docs), np.mean([len(doc) for doc in rand_docs])

CPU times: user 10min 10s, sys: 1.78 s, total: 10min 12s
Wall time: 10min 21s


(1000000, np.float64(1199.320602))

# Step 2: Insert into Knowledge Base

In [6]:
DB_FILE_PATH = './onemillion.sqlite'

kb = svs.KB(DB_FILE_PATH, fake_embeddings, force_fresh_db=True)



In [7]:
with kb.bulk_add_docs() as add_doc:
    for doc in rand_docs:
        add_doc(doc)

2024-07-28 00:24:46,202 - svs.kb - INFO - starting bulk-add (as new database transaction)
2024-07-28 00:24:53,793 - svs.kb - INFO - getting 1000000 document embeddings...
2024-07-28 00:27:50,351 - svs.kb - INFO - *DONE*: got 1000000 document embeddings
2024-07-28 00:27:50,351 - svs.kb - INFO - invalidating cached vectors; they'll be re-built next time you `retrieve()`
2024-07-28 00:27:50,351 - svs.kb - INFO - ending bulk-add (committing the database transaction)


In [8]:
kb.close()

2024-07-28 00:27:53,916 - svs.kb - INFO - invalidating cached vectors; they'll be re-built next time you `retrieve()`


# Step 3: Retrieve Top-100 Documents

In [9]:
kb = svs.KB(DB_FILE_PATH, fake_embeddings)

len(kb)



1000000

In [10]:
%%time

# !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
# !!! THIS IS THE FIRST RETRIEVAL; IT WILL BE SLOW BECAUSE THE VECTORS HAVE TO BE LOADED FROM DISK INTO RAM !!!
# !!!                         (SUBSEQUENT RETRIEVALS WILL BE FASTER)                                        !!!
# !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!

_ = kb.retrieve('pizza', n=100)

2024-07-28 00:27:57,506 - svs.kb - INFO - retrieving 100 documents with query string: pizza
2024-07-28 00:27:57,509 - svs.kb - INFO - re-building cached vectors...
2024-07-28 00:29:36,220 - svs.kb - INFO - re-building cached vectors... DONE!
2024-07-28 00:29:36,221 - svs.kb - INFO - got embedding for query!
2024-07-28 00:29:36,473 - svs.kb - INFO - computed 1000000 cosine similarities
2024-07-28 00:29:36,479 - svs.kb - INFO - retrieved top 100 documents


CPU times: user 1min 33s, sys: 5.66 s, total: 1min 38s
Wall time: 1min 38s


In [13]:
%%time

# !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
# !!! THIS SUBSEQUENT RETRIEVAL WILL BE FAST BECAUSE IT WILL USE THE CACHED VECTORS ALREADY IN RAM !!!
# !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!

_ = kb.retrieve('taco', n=100)

2024-07-28 00:29:46,840 - svs.kb - INFO - retrieving 100 documents with query string: taco
2024-07-28 00:29:46,844 - svs.kb - INFO - using cached vectors
2024-07-28 00:29:46,848 - svs.kb - INFO - got embedding for query!
2024-07-28 00:29:47,085 - svs.kb - INFO - computed 1000000 cosine similarities
2024-07-28 00:29:47,087 - svs.kb - INFO - retrieved top 100 documents


CPU times: user 783 ms, sys: 2.05 ms, total: 786 ms
Wall time: 248 ms


In [14]:
kb.close()

2024-07-28 00:29:49,401 - svs.kb - INFO - invalidating cached vectors; they'll be re-built next time you `retrieve()`
