In [1]:
import faiss

In [2]:
import numpy as np 

In [3]:
Xt = np.loadtxt('saved_db_1m.csv', delimiter = ',')

In [4]:
data = Xt[:,1:]
data[:1,:]

array([[0.8922525 , 0.2800759 , 0.26163924, 0.46114665, 0.894992  ,
        0.12171966, 0.24928057, 0.52260834, 0.24156952, 0.40916836,
        0.34428424, 0.07163954, 0.03442818, 0.09896666, 0.66252285,
        0.9862883 , 0.92981035, 0.69406784, 0.92269063, 0.44868845,
        0.55067986, 0.6401768 , 0.40450466, 0.2704888 , 0.7013119 ,
        0.30187392, 0.5553066 , 0.07334203, 0.2125904 , 0.05258554,
        0.61584634, 0.8074751 , 0.8789867 , 0.8133053 , 0.92158616,
        0.00497156, 0.7687784 , 0.3300553 , 0.6475257 , 0.08908683,
        0.21998852, 0.4679469 , 0.2687173 , 0.61768943, 0.90623456,
        0.55104804, 0.9345592 , 0.13406181, 0.4424578 , 0.7024428 ,
        0.15089315, 0.6129694 , 0.40907687, 0.46742535, 0.5311658 ,
        0.29712713, 0.7017058 , 0.91839385, 0.13668495, 0.08343965,
        0.2610187 , 0.14476323, 0.39879185, 0.37857074, 0.23424745,
        0.25922382, 0.30125505, 0.6162096 , 0.8325671 , 0.6945543 ]])

## Trial 1 : Using IndexFlatL2 as coarse quantizer and IVFPQ 

In [5]:
quantizer = faiss.IndexFlatL2(70)


In [6]:
index = faiss.IndexIVFPQ(quantizer,70,150,10,8) #=> Quantizer , Dimension , clusters , reduceDimension , nbits

In [7]:
Xtrain = np.random.rand(10000, 70).astype('float32')

In [8]:
index.train(Xtrain)

In [9]:
index.add(data)

In [10]:
Q = np.random.random((1,70))
Q

array([[0.95713852, 0.1255858 , 0.51939283, 0.88571366, 0.02494473,
        0.60542467, 0.5937567 , 0.47759632, 0.88098268, 0.20459794,
        0.60516436, 0.82993367, 0.09323417, 0.03752753, 0.51255918,
        0.31370498, 0.33502925, 0.27021796, 0.68573947, 0.77525896,
        0.24537603, 0.18115012, 0.84654879, 0.17967183, 0.77487567,
        0.63213973, 0.79996122, 0.23676305, 0.9951289 , 0.53076527,
        0.98850615, 0.99930816, 0.95671029, 0.45048102, 0.71455581,
        0.27403173, 0.94583074, 0.75587833, 0.28651601, 0.86268937,
        0.55703547, 0.33321402, 0.44074703, 0.34542888, 0.6041236 ,
        0.78121488, 0.07596347, 0.18537938, 0.39350902, 0.9095119 ,
        0.38322556, 0.33431077, 0.98035071, 0.00379226, 0.33685277,
        0.38391528, 0.54045469, 0.32790782, 0.29777862, 0.69086592,
        0.38967699, 0.635463  , 0.7152153 , 0.19898011, 0.46220471,
        0.6654462 , 0.77986992, 0.56833693, 0.72527947, 0.08238839]])

In [11]:
%%time

dist_x , id_x = index.search(Q,5)

dist_x

CPU times: total: 0 ns
Wall time: 1.01 ms


array([[5.807326 , 6.5578537, 6.6312914, 6.696563 , 6.6993923]],
      dtype=float32)

## Trial 2 : Using hnsw as a quantizer with IVFPQ

In [12]:
quantizer_hnsw = faiss.IndexHNSWFlat(70,10)
index_hnswivfpq = faiss.IndexIVFPQ(quantizer_hnsw, 70, 100, 10, 8)

In [13]:
%%time
index_hnswivfpq.train(Xtrain)
index_hnswivfpq.add(data)

CPU times: total: 17.5 s
Wall time: 2.83 s


In [14]:
%time dists , ids = index_hnswivfpq.search(Q,5)

dists

CPU times: total: 15.6 ms
Wall time: 509 µs


array([[6.230936 , 6.2422013, 6.3266373, 6.3320074, 6.393177 ]],
      dtype=float32)

In [15]:
Q

array([[0.95713852, 0.1255858 , 0.51939283, 0.88571366, 0.02494473,
        0.60542467, 0.5937567 , 0.47759632, 0.88098268, 0.20459794,
        0.60516436, 0.82993367, 0.09323417, 0.03752753, 0.51255918,
        0.31370498, 0.33502925, 0.27021796, 0.68573947, 0.77525896,
        0.24537603, 0.18115012, 0.84654879, 0.17967183, 0.77487567,
        0.63213973, 0.79996122, 0.23676305, 0.9951289 , 0.53076527,
        0.98850615, 0.99930816, 0.95671029, 0.45048102, 0.71455581,
        0.27403173, 0.94583074, 0.75587833, 0.28651601, 0.86268937,
        0.55703547, 0.33321402, 0.44074703, 0.34542888, 0.6041236 ,
        0.78121488, 0.07596347, 0.18537938, 0.39350902, 0.9095119 ,
        0.38322556, 0.33431077, 0.98035071, 0.00379226, 0.33685277,
        0.38391528, 0.54045469, 0.32790782, 0.29777862, 0.69086592,
        0.38967699, 0.635463  , 0.7152153 , 0.19898011, 0.46220471,
        0.6654462 , 0.77986992, 0.56833693, 0.72527947, 0.08238839]])

## Statistics and Comparison between cosine similarity of Q and the vectors in data  

### Brute-Force cos-similarity 

In [16]:
b = Q[0]
a3la_cos_sim = []
for idx in range(90000):
    a = data[idx,:].reshape(1,-1)
    xos = np.dot(a, b)/(np.linalg.norm(a)*(np.linalg.norm(b)))
    if xos > 0.85:
        a3la_cos_sim.append(xos)
        print(xos, idx)

[0.85150352] 52
[0.85503787] 415
[0.8672097] 624
[0.87106988] 1535
[0.8567513] 1906
[0.85045508] 3197
[0.85154939] 3798
[0.85104825] 4114
[0.85236704] 4233
[0.85266526] 4435
[0.86523382] 4563
[0.85438514] 4618
[0.85527604] 5076
[0.8786697] 6400
[0.85963828] 6877
[0.85496844] 8216
[0.85690809] 8493
[0.87311338] 8790
[0.86088666] 9344
[0.85569788] 10103
[0.85325722] 10784
[0.87670609] 10956
[0.85319847] 11440
[0.85171471] 11461
[0.86865618] 11674
[0.85111135] 12333
[0.85929665] 12447
[0.86174366] 13055
[0.85500407] 14540
[0.85817766] 14668
[0.85091413] 15035
[0.86700805] 15195
[0.86087364] 15451
[0.85105519] 15865
[0.86009587] 15910
[0.86244308] 16431
[0.85877755] 16560
[0.85122545] 17413
[0.85996312] 17982
[0.85466983] 18523
[0.85766158] 18870
[0.85527524] 19054
[0.85518275] 19926
[0.85936165] 20299
[0.85226298] 21782
[0.86150949] 21802
[0.86655627] 21845
[0.85497382] 23358
[0.86847179] 24896
[0.85249693] 25775
[0.85550188] 26848
[0.85360184] 27208
[0.85020815] 29168
[0.88458609] 30652


In [17]:
np.max(a3la_cos_sim)

0.884586086596255

## Ids extracted from faiss hnswivfpq as nearest distance

In [18]:
print(id_x) #=> ivfpq no hnsw
b = Q[0]
for idx in id_x[0]:
    a = data[idx,:].reshape(1,-1)
    print(np.dot(a, b)/(np.linalg.norm(a)*(np.linalg.norm(b))))

[[43867 11725 21782 79301 29006]]
[0.8534924]
[0.84021952]
[0.85226298]
[0.85124863]
[0.84639703]


In [19]:
print(ids) #=> with hnsw

b = Q[0]
for idx in ids[0]:
    a = data[idx,:].reshape(1,-1)
    print(np.dot(a, b)/(np.linalg.norm(a)*(np.linalg.norm(b))))

[[ 5941 51518 42418 59393 73201]]
[0.84689645]
[0.84546353]
[0.8537365]
[0.86932]
[0.84627526]


## PQ

In [20]:
import nanopq

In [21]:
N, Nt, D = 10000, 5000, 70
# X = np.random.random((N, D)).astype(np.float32)  # 10,000 128-dim vectors to be indexed
Xt = np.random.random((Nt, D)).astype(np.float32)  # 2,000 128-dim vectors for training
query = np.random.random((D,)).astype(np.float32)  # a 128-dim query vector

# Instantiate with M=8 sub-spaces
pq = nanopq.PQ(M=7)

# Train codewords
pq.fit(Xt)

# Encode to PQ-codes
X_code = pq.encode(data.astype(np.float32))  # (90000, 7) with dtype=np.uint8

# Results: create a distance table online, and compute Asymmetric Distance to each PQ-code 
dists = pq.dtable(query).adist(X_code)  # (90000, ) 

M: 7, Ks: 256, metric : <class 'numpy.uint8'>, code_dtype: l2
iter: 20, seed: 123
Training the subspace: 0 / 7
Training the subspace: 1 / 7
Training the subspace: 2 / 7
Training the subspace: 3 / 7
Training the subspace: 4 / 7
Training the subspace: 5 / 7
Training the subspace: 6 / 7
Encoding the subspace: 0 / 7
Encoding the subspace: 1 / 7
Encoding the subspace: 2 / 7
Encoding the subspace: 3 / 7
Encoding the subspace: 4 / 7
Encoding the subspace: 5 / 7
Encoding the subspace: 6 / 7


In [22]:
dists

array([ 8.768163,  9.334701, 10.043791, ...,  9.517483,  8.785688,
       10.095693], dtype=float32)

## IVF_Flat    WINNER WINNER CHICKEN DINNER

In [60]:
from memory_profiler import memory_usage


def memory_usage_run_queries(args):
    # This part is added to calcauate the RAM usage
    mem_before = max(memory_usage())
    mem = memory_usage(proc=(run_queries, args, {}), interval = 1e-3)
    return max(mem) - mem_before




In [61]:
# Generating random data (1M vectors with dimension 70)
np.random.seed(42)
Xt = np.random.rand(10000, 70).astype('float32')

# Parameters
d = 70  # Dimension of the vectors
nlist = 100  # Number of clusters
quantizer = faiss.IndexFlatL2(d)  # Flat index for clustering

# Create IndexIVFFlat
index = faiss.IndexIVFFlat(quantizer, d, nlist, faiss.METRIC_L2)

index.train(Xt)
index.add(data)

# Save the index
# faiss.write_index(index, "index_ivf_flat.index")


In [62]:
def run_queries(Q,K):
    dists_flat , ids_flat = index.search(Q,K)
    return dists_flat, ids_flat

memory_usage_run_queries((Q,40))

0.01953125

In [31]:
dists_flat # L2 distance 

array([[5.0295053, 5.490045 , 5.491939 , 5.6248503, 5.669506 , 5.727092 ,
        5.7533965, 5.8510523, 5.872204 , 5.875067 ]], dtype=float32)

In [32]:
b = Q[0] #cosine similarity
for idx in ids_flat[0]:
    a = data[idx,:].reshape(1,-1)
    print(np.dot(a, b)/(np.linalg.norm(a)*(np.linalg.norm(b))))

[0.8687639]
[0.87137475]
[0.85104825]
[0.86853004]
[0.82786012]
[0.8567513]
[0.85569788]
[0.84233067]
[0.83288655]
[0.82407405]


## Index IVFPQ only

In [26]:
# Generating random data (1M vectors with dimension 70)
np.random.seed(42)
Xt = np.random.rand(10000, 70).astype('float32')

# Parameters
d = 70  # Dimension of the vectors
nlist = 100  # Number of clusters
m = 7  # Number of subquantizers
nbits = 8  # Number of bits for quantization

# Create IndexIVFPQ
index = faiss.IndexIVFPQ(faiss.IndexFlatL2(d), d, nlist, m, nbits)

# Train the index
index.train(Xt)
index.add(data)

dists_ivfpq , ids_ivfpq = index.search(Q,5)



# Save the index
#faiss.write_index(index, "index_ivf_pq.index")


In [27]:
dists_ivfpq

array([[5.0295053, 5.490045 , 5.491939 , 5.6248503, 5.669506 ]],
      dtype=float32)

In [28]:
b = Q[0]
for idx in ids_ivfpq[0]:
    a = data[idx,:].reshape(1,-1)
    print(np.dot(a, b)/(np.linalg.norm(a)*(np.linalg.norm(b))))

[0.8687639]
[0.87137475]
[0.85104825]
[0.86853004]
[0.82786012]


## IVF only 

In [29]:
# Generating random data (1M vectors with dimension 70)
np.random.seed(42)
Xt = np.random.rand(2000, 70).astype('float32')

# Parameters
d = 70  # Dimension of the vectors
nlist = 100  # Number of clusters
quantizer = faiss.IndexFlatL2(d)  # Flat index for clustering

# Create IndexIVF
index = faiss.IndexIVF(quantizer, d, nlist, faiss.METRIC_L2)

# Train the index
index.train(Xt)
index.add(data)

# Save the index
# faiss.write_index(index, "index_ivf.index")


AttributeError: No constructor defined - class is abstract