In [1]:
from vec_db import VecDB
import numpy as np 

## **For each size, a folder named "saved_db" is generated; zip this folder and upload it to Drive.**

## Generate indices for: 1M, 5M, 10M

In [2]:
# Dataset configuration. The run captured in this notebook uses 100k rows;
# set NUM_RECORDS to 10**6, 5 * 10**6 or 10**7 to build the larger indices.
DB_SEED = 200
NUM_RECORDS = 10**5
DIMENSION = 70

# Seeded generator so the same vectors are produced on every run.
rng = np.random.default_rng(DB_SEED)
vectors = rng.random((NUM_RECORDS, DIMENSION), dtype=np.float32)

# One record per row; the row index doubles as the record id.
# `list(row)` is kept as-is so every embedding is handed on in the same form
# as before (a Python list of NumPy float32 scalars).
records_dict = [{"id": i, "embed": list(row)} for i, row in enumerate(vectors)]

In [3]:
# Create the vector database and index every generated record.
# The insert is slow: the captured output below shows a clustering pass
# (k-means++ init followed by minibatch steps) running during this call.
db = VecDB()
db.insert_records(records_dict)  # long-running step

951
begin clustering
Init 1/1 with method k-means++
Inertia for init 1/1: 662284.4969350238
Minibatch step 1/100: mean batch inertia: 6.62172513657306
Minibatch step 2/100: mean batch inertia: 4.933457012632148, ewa inertia: 4.933457012632148
Minibatch step 3/100: mean batch inertia: 4.902444222116019, ewa inertia: 4.902444222116019
Minibatch step 4/100: mean batch inertia: 4.888418168854718, ewa inertia: 4.888418168854718
Minibatch step 5/100: mean batch inertia: 4.876155926181821, ewa inertia: 4.876155926181821
Minibatch step 6/100: mean batch inertia: 4.870201777026597, ewa inertia: 4.870201777026597
Minibatch step 7/100: mean batch inertia: 4.863474451571094, ewa inertia: 4.863474451571094
Minibatch step 8/100: mean batch inertia: 4.8637294312134935, ewa inertia: 4.8637294312134935
Minibatch step 9/100: mean batch inertia: 4.85888373738964, ewa inertia: 4.85888373738964
Minibatch step 10/100: mean batch inertia: 4.858281574230385, ewa inertia: 4.858281574230385
Minibatch step 11/10

In [9]:
# Dedicated generator for the query vector, seeded separately from the
# generator that produced the database rows.
query_seed = 50
rng_query = np.random.default_rng(seed=query_seed)

# A single float32 query of shape (1, 70), i.e. one row with the same width
# as the database vectors.
query = rng_query.random(size=(1, 70), dtype=np.float32)

## Test clusters (50, 100, 150) — 50 has been used in vec_db so far and works fine
## **Test Before Uploading** 

In [5]:
import time
from dataclasses import dataclass
from typing import List

# NOTE(review): this constant is not referenced by any code visible in this
# notebook — confirm whether it is still needed.
AVG_OVERX_ROWS = 10


@dataclass
class Result:
    """Outcome of one benchmark query, as produced by `run_queries`."""

    # Seconds spent inside the database's retrieval call for this query.
    run_time: float
    # Number of neighbours that were requested from the database.
    top_k: int
    # Ids the database returned for the query.
    db_ids: List[int]
    # Row indices of the reference array ordered from most to least similar
    # to the query (cosine similarity, computed by brute force).
    actual_ids: List[int]


def run_queries(db, np_rows, top_k, num_runs, rng=None):
    """Time `db.retrive` on random queries and record the exact ranking.

    For each run a random query row is drawn, the database lookup is timed,
    and the reference ranking of every row in `np_rows` is computed by
    brute-force cosine similarity.

    Args:
        db: Object exposing `retrive(query, top_k)`.
        np_rows: 2-D array holding the indexed vectors, one per row. The
            query width is taken from its second dimension.
        top_k: Number of neighbours requested from `db`.
        num_runs: Number of random queries to issue.
        rng: Optional `numpy.random.Generator` used to draw the queries.
            When omitted, NumPy's global random state is used, exactly as
            before; pass a seeded generator for reproducible runs.

    Returns:
        A list with one `Result` per run. `actual_ids` holds all row
        indices ordered from most to least similar to the query.
    """
    dim = np_rows.shape[1]
    # `np.random` (legacy global state) and `Generator` both offer `.random(size)`.
    source = np.random if rng is None else rng
    # Row norms do not depend on the query, so compute them once.
    row_norms = np.linalg.norm(np_rows, axis=1)

    results = []
    for _ in range(num_runs):
        query = source.random((1, dim))

        # Only the database lookup is timed; perf_counter is monotonic and
        # has the highest available resolution for short intervals.
        tic = time.perf_counter()
        db_ids = db.retrive(query, top_k)
        run_time = time.perf_counter() - tic

        # Cosine similarity of every row against the query, shape (1, n_rows).
        similarities = np_rows.dot(query.T).T / (row_norms * np.linalg.norm(query))
        # argsort is ascending, so reverse it to get best-first. Indexing
        # row 0 (instead of squeeze) keeps this a list even for one row.
        actual_ids = np.argsort(similarities, axis=1)[0][::-1].tolist()

        results.append(Result(run_time, top_k, db_ids, actual_ids))
    return results


def eval(results: "List[Result]"):
    """Aggregate benchmark results into an average score and average time.

    Scores are never positive; 0 is the best possible score. For each result:

    * if the database did not return exactly `top_k` distinct ids, the
      score is the worst case, `-len(actual_ids) * top_k`;
    * otherwise each returned id is looked up in `actual_ids`: an id ranked
      beyond position `top_k * 3` costs its rank, an id that is missing
      from `actual_ids` costs `len(actual_ids)`, and any other id is free.

    Args:
        results: Objects exposing `run_time`, `top_k`, `db_ids` and
            `actual_ids`.

    Returns:
        A `(mean_score, mean_run_time)` tuple.

    Raises:
        ZeroDivisionError: If `results` is empty.
    """
    scores = []
    run_times = []
    for res in results:
        run_times.append(res.run_time)
        # Wrong number of ids, or duplicated ids: assign the lowest score.
        if len(set(res.db_ids)) != res.top_k or len(res.db_ids) != res.top_k:
            scores.append(-1 * len(res.actual_ids) * res.top_k)
            continue
        score = 0
        for db_id in res.db_ids:
            try:
                rank = res.actual_ids.index(db_id)
            except ValueError:
                # Only "id not present" counts as a miss; any other error is
                # a real problem and must not be silently scored.
                score -= len(res.actual_ids)
                continue
            if rank > res.top_k * 3:
                score -= rank
        scores.append(score)

    return sum(scores) / len(scores), sum(run_times) / len(run_times)

In [10]:
# `run_queries` draws its query vectors from NumPy's global random state, so
# seed that state with `query_seed` to make this run repeatable.
np.random.seed(query_seed)
res = run_queries(db, vectors, 5, 1)  # 100k vectors, top-5, a single query
print(eval(res))  # (average score, average retrieval time in seconds)

(0.0, 0.7171332836151123)


In [None]:
# Drop the database handle and the raw vector array before the next dataset
# size is generated. `records_dict` is not released by this cell.
del db, vectors