In [4]:
from vec_db import VecDB
import numpy as np 
import os

## **For each size, a folder named "saved_db" is generated; zip this folder and upload it to Drive.**

## Generate indices for: 1M, 5M, 10M

In [14]:
# Set the CPU_COUNT environment variable of this process to "4".
# NOTE(review): the previous comment said 2 while the value written is "4";
# confirm which one is intended.
os.environ.update(CPU_COUNT="4")


In [13]:
# Seed of the generator that produces the database vectors.
DB_SEED = 50
# Number of vectors to index. Change this by hand for each index size:
# 10**6, 5 * 10**6, 10**7.
NUM_VECTORS = 10**6
# Number of components in every vector.
VECTOR_DIM = 70

rng = np.random.default_rng(DB_SEED)
vectors = rng.random((NUM_VECTORS, VECTOR_DIM), dtype=np.float32)

# One record per row; the id is the row's position in `vectors`.
# (The comprehension binds its own `i`, so no pre-initialised counter is needed.)
records_dict = [{"id": i, "embed": list(row)} for i, row in enumerate(vectors)]

In [15]:
# Create the vector database and load every generated record into it.
db = VecDB()
# Slow step: the output recorded for this cell shows a MiniBatchKMeans
# clustering log being printed while the records are inserted.
# NOTE(review): presumably this is also the step that writes the "saved_db"
# folder mentioned at the top of the notebook — TODO confirm in vec_db.
db.insert_records(records_dict) #=> it will take time 

3000
begin clustering
Init 1/1 with method k-means++
Inertia for init 1/1: 1884250.1314875153
[MiniBatchKMeans] Reassigning 50 cluster centers.
Minibatch step 1/1000: mean batch inertia: 6.280387675779807
[MiniBatchKMeans] Reassigning 47 cluster centers.
Minibatch step 2/1000: mean batch inertia: 4.847327382977169, ewa inertia: 4.847327382977169
Minibatch step 3/1000: mean batch inertia: 4.829189858701879, ewa inertia: 4.843699881749612
Minibatch step 4/1000: mean batch inertia: 4.820141534657533, ewa inertia: 4.838988217042861
[MiniBatchKMeans] Reassigning 36 cluster centers.
Minibatch step 5/1000: mean batch inertia: 4.815471248422861, ewa inertia: 4.83428482802225
Minibatch step 6/1000: mean batch inertia: 4.808252913738453, ewa inertia: 4.829078450371869
[MiniBatchKMeans] Reassigning 40 cluster centers.
Minibatch step 7/1000: mean batch inertia: 4.807604091396171, ewa inertia: 4.824783582871596
Minibatch step 8/1000: mean batch inertia: 4.802169105005069, ewa inertia: 4.82026069182

In [16]:
# Separate seeded generator that is used only for the query vector.
query_seed = 140
rng_query = np.random.default_rng(seed=query_seed)

# A single random query of shape (1, 70), stored as float32.
# NOTE(review): run_queries draws its own queries, so this `query` is not
# consumed by the evaluation cells visible in this notebook.
query = rng_query.random(size=(1, 70), dtype=np.float32)

## Test clusters (50, 100, 150) --> I've been using 50 in vec_db and it works fine
## **Test Before Uploading** 

In [17]:
import time
from dataclasses import dataclass
from typing import List

# NOTE(review): this constant is not referenced by any code visible in this
# notebook; confirm whether it is still needed.
AVG_OVERX_ROWS = 10


@dataclass
class Result:
    """Outcome of a single query issued by ``run_queries``."""

    # Elapsed seconds of the ``db.retrive`` call for this query.
    run_time: float
    # Number of ids that were requested from the database.
    top_k: int
    # Ids returned by ``db.retrive``.
    db_ids: List[int]
    # Every row index of the reference array, ordered from the most to the
    # least cosine-similar row with respect to the query.
    actual_ids: List[int]


def run_queries(db, np_rows, top_k, num_runs, rng=None):
    """Time ``db.retrive`` on random queries and compute the exact ranking.

    For every run a random query of shape ``(1, np_rows.shape[1])`` is drawn,
    the call ``db.retrive(query, top_k)`` is timed, and all rows of
    ``np_rows`` are ranked by cosine similarity to that query.

    Args:
        db: Object exposing ``retrive(query, top_k)`` (spelled this way by
            the database class).
        np_rows: 2-D array holding the stored vectors, one per row.
        top_k: Number of ids requested from ``db``.
        num_runs: Number of random queries to issue.
        rng: Optional ``numpy.random.Generator`` used to draw the queries so
            that a run can be reproduced. When omitted, NumPy's global
            random state is used.

    Returns:
        A list with one ``Result`` per run, holding the elapsed seconds of
        the ``db.retrive`` call, ``top_k``, the ids returned by the database
        and the row indices of ``np_rows`` sorted from the most to the least
        similar row.
    """
    # The query needs as many components as a stored row, otherwise the dot
    # product below cannot be computed.
    dim = np_rows.shape[1]
    # The row norms do not depend on the query, so compute them only once.
    row_norms = np.linalg.norm(np_rows, axis=1)

    results = []
    for _ in range(num_runs):
        if rng is None:
            query = np.random.random((1, dim))
        else:
            query = rng.random((1, dim))

        # perf_counter is the clock intended for measuring short intervals.
        tic = time.perf_counter()
        db_ids = db.retrive(query, top_k)
        run_time = time.perf_counter() - tic

        # Cosine similarity of every row to the query, shape (1, n_rows).
        similarities = np_rows.dot(query.T).T / (row_norms * np.linalg.norm(query))
        # Take row 0 explicitly: squeeze() would collapse a single-row
        # ranking to a scalar and break the reversal below.
        actual_ids = np.argsort(similarities, axis=1)[0].tolist()[::-1]

        results.append(Result(run_time, top_k, db_ids, actual_ids))
    return results


def eval(results: "List[Result]"):
    """Average the retrieval score and the run time over ``results``.

    Scores are never positive, so 0 is the best possible score. For each
    result:

    * if the database did not return exactly ``top_k`` distinct ids, the
      score is ``-len(actual_ids) * top_k``;
    * otherwise every returned id that is missing from ``actual_ids`` costs
      ``len(actual_ids)``, and every id ranked beyond position
      ``3 * top_k`` costs its rank.

    Note: the function name shadows the builtin ``eval``; it is kept because
    callers use it.

    Args:
        results: Results produced by ``run_queries``.

    Returns:
        A tuple ``(mean score, mean run time)``.

    Raises:
        ZeroDivisionError: If ``results`` is empty.
    """
    scores = []
    run_times = []
    for res in results:
        run_times.append(res.run_time)
        # A wrong number of ids, or duplicated ids, gets the lowest score.
        if len(set(res.db_ids)) != res.top_k or len(res.db_ids) != res.top_k:
            scores.append(-1 * len(res.actual_ids) * res.top_k)
            continue
        score = 0
        for db_id in res.db_ids:
            try:
                rank = res.actual_ids.index(db_id)
            except ValueError:
                # list.index raises ValueError for an id that is not present.
                score -= len(res.actual_ids)
                continue
            # Ranks within the first 3 * top_k positions are not penalised.
            if rank > res.top_k * 3:
                score -= rank
        scores.append(score)

    return sum(scores) / len(scores), sum(run_times) / len(run_times)

In [20]:
# One run asking for the 5 nearest ids against the vectors generated above.
# The query is drawn inside run_queries, so it is not tied to a query seed.
res = run_queries(db, vectors, top_k=5, num_runs=1)
# Prints the tuple (mean score, mean run time).
print(eval(res))

(0.0, 3.6502702236175537)


In [None]:
# Drop the notebook's references to the database and to the raw vectors.
# NOTE(review): records_dict is not deleted here and still holds one list per
# row — confirm whether it should be released as well.
del db, vectors