## Note: use the faiss environment, which runs Python 3.7

Otherwise, with the Python 3.8 build of faiss, there is a bug when linking to Intel MKL, and the error is not even shown in Jupyter.

In [1]:
import os
import sys

import faiss
import numpy as np

In [2]:
def mmap_fvecs(fname):
    """Memory-map an .fvecs file and return its vectors as a 2-D float32 view.

    The file is opened read-only as a flat int32 buffer. Its first int32 is
    taken as the vector dimension d. The same buffer is then reinterpreted
    as float32, laid out as rows of d + 1 values, and the leading per-row
    dimension field is sliced away.
    """
    raw = np.memmap(fname, dtype='int32', mode='r')
    row_width = raw[0] + 1  # one dimension header followed by d components
    rows = raw.view('float32').reshape(-1, row_width)
    return rows[:, 1:]

def mmap_bvecs(fname):
    """Memory-map a .bvecs file and return its vectors as a 2-D uint8 view.

    The file is opened read-only as a flat uint8 buffer. The first four
    bytes are read as one int32 giving the vector dimension d. The buffer is
    then laid out as rows of d + 4 bytes and the 4-byte header of every row
    is sliced away.
    """
    raw = np.memmap(fname, dtype='uint8', mode='r')
    dim = raw[:4].view('int32')[0]  # first four bytes hold the int32 dimension
    records = raw.reshape(-1, dim + 4)
    return records[:, 4:]

def ivecs_read(fname):
    """Read an .ivecs file fully into memory as a 2-D int32 array.

    The file is read as a flat int32 buffer whose first element is taken as
    the row length d. The buffer is laid out as rows of d + 1 values, the
    leading per-row length field is dropped, and an independent copy is
    returned.

    Wenqi: Format of ground truth (for 10000 query vectors):
        1000(topK), [1000 ids]
        1000(topK), [1000 ids]
             ...     ...
        1000(topK), [1000 ids]
    10000 rows in total, 10000 * 1001 elements, 10000 * 1001 * 4 bytes
    """
    flat = np.fromfile(fname, dtype='int32')
    width = flat[0]
    table = flat.reshape(-1, width + 1)
    return table[:, 1:].copy()

def fvecs_read(fname):
    """Read an .fvecs file into memory as a 2-D float32 array.

    The file is loaded with ivecs_read and the resulting int32 payload is
    reinterpreted (not converted) as float32.
    """
    as_int32 = ivecs_read(fname)
    return as_int32.view('float32')


def choose_train_size(ncentroids):
    """Return the number of training vectors to use for `ncentroids` centroids.

    The result is never below 256 * 1000 (some training vectors for PQ and
    the PCA) and never below 100 vectors per centroid.
    """
    baseline = 256 * 1000
    per_centroid_budget = 100 * ncentroids
    return max(baseline, per_centroid_budget)

In [3]:
dbname = 'SIFT1M'

# Only the SIFT family of datasets is handled; bail out early otherwise
if not dbname.startswith('SIFT'):
    print('unknown dataset', dbname, file=sys.stderr)
    sys.exit(1)

# SIFT1M to SIFT1000M: the number between 'SIFT' and the trailing 'M'
# is the database size in millions of vectors
dbsize = int(dbname[4:-1])
N_VEC = int(dbsize * 1000 * 1000)

xb = mmap_bvecs('/mnt/scratch/wenqi/Faiss_experiments/bigann/bigann_base.bvecs')
xq = mmap_bvecs('/mnt/scratch/wenqi/Faiss_experiments/bigann/bigann_query.bvecs')
gt = ivecs_read('/mnt/scratch/wenqi/Faiss_experiments/bigann/gnd/idx_%dM.ivecs' % dbsize)

# trim xb to correct size (the slice is still backed by the memory map)
xb = xb[:N_VEC]

# Wenqi: load xq to main memory as float32; ground truth as int32
xq = np.array(xq.astype('float32').copy(), dtype=np.float32)
gt = np.array(gt, dtype=np.int32)

print("Vector shapes:")
print("Base vector xb: ", xb.shape)
print("Query vector xq: ", xq.shape)
print("Ground truth gt: ", gt.shape)

dim = xb.shape[1]  # should be 128
nq = xq.shape[0]

Vector shapes:
Base vector xb:  (1000000, 128)
Query vector xq:  (10000, 128)
Ground truth gt:  (10000, 1000)


In [4]:
# Number of centroids is 2^kbits (2^5 = 32)
kbits = 5
ncentroids = 1 << kbits
M = 1  # a single sub-quantizer: only want to do clustering

# Take the first train_size base vectors, converted to float32, as training data
train_size = choose_train_size(ncentroids)
print(f"PQ on {train_size} vectors")
xt = xb[:train_size].astype('float32').copy()

PQ on 256000 vectors


In [5]:
# Build a PQ index from the dim / M / kbits values set above and train it on
# the float32 training vectors xt. M = 1 was chosen above because only the
# clustering is wanted.
index = faiss.IndexPQ(dim, M, kbits)
index.train(xt)

In [46]:
# List the attribute and method names available on the trained index
dir(index)

['ST_HE',
 'ST_PQ',
 'ST_SDC',
 'ST_generalized_HE',
 'ST_polysemous',
 'ST_polysemous_generalize',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setstate__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__swig_destroy__',
 '__weakref__',
 'add',
 'add_c',
 'add_with_ids',
 'add_with_ids_c',
 'assign',
 'assign_c',
 'codes',
 'compute_residual',
 'compute_residual_n',
 'd',
 'do_polysemous_training',
 'encode_signs',
 'get_distance_computer',
 'hamming_distance_histogram',
 'hamming_distance_table',
 'is_trained',
 'metric_arg',
 'metric_type',
 'ntotal',
 'polysemous_ht',
 'polysemous_training',
 'pq',
 'range_search',
 'range_search_c',
 'reconstruct',
 'reconstruct_c',
 'reconstruct_n',
 'reconstruct_n

In [6]:
def get_centroids(index):
    """Return the centroids of the index's product quantizer as a 3-D array.

    The flat centroid vector stored in index.pq is converted to a numpy
    array and reshaped to (M, ksub, dsub), where
        M    = num of sub-quantizers
        ksub = num clusters of each sub-quantizer
        dsub = number of dimensions handled by each sub-quantizer
    In this notebook M = 1, and the table at position 0 has shape (32, 128).
    """
    # read the Voronoi cell centroids
    quantizer = index.pq
    flat = faiss.vector_to_array(quantizer.centroids)
    return flat.reshape(quantizer.M, quantizer.ksub, quantizer.dsub)

In [7]:
# Centroid tables of the trained index, reshaped by get_centroids to (M, ksub, dsub)
cen = get_centroids(index)

In [13]:
# Take the centroid table at position 0 (M = 1 was set above, so it is the
# only one) and show its shape
centroids = cen[0]
centroids.shape

(32, 128)

In [14]:
# Display the centroid values
centroids

array([[52.006847 , 20.335617 ,  7.1746573, ...,  5.4246573,  5.6369863,
         8.006849 ],
       [14.486486 , 12.254826 , 11.567568 , ..., 32.46332  , 10.602317 ,
         8.888031 ],
       [11.93007  , 33.923077 , 80.27273  , ..., 19.391607 , 29.832167 ,
        24.174826 ],
       ...,
       [14.319218 ,  9.599348 ,  8.622149 , ...,  7.061889 ,  5.4592834,
        14.625406 ],
       [49.870003 , 23.253334 ,  9.493334 , ..., 21.203333 , 10.43     ,
         9.9366665],
       [22.275    , 16.896875 , 16.765625 , ...,  9.26875  ,  8.76875  ,
         9.2625   ]], dtype=float32)

## Partition the data

In [15]:
def compute_centroid_distances(cluster_centers, query_vecs):
    """
    Compute the squared L2 distance from every query vector to every centroid.

    Input:
        cluster_centers: 2-d array (num_clusters, dim)
        query_vecs: 2-d array (num_queries, dim)
    Output:
        distance_mat (num_queries, num_clusters), float64,
            each element is a distance (L2 square)

    Integer (e.g. uint8) inputs are handled without wrap-around: centroids
    that are not already floating point are promoted before the subtraction.
    Floating-point inputs are used as they are.
    """
    num_clusters, dim = cluster_centers.shape
    nq = query_vecs.shape[0]
    assert dim == query_vecs.shape[1]

    # uint8 - uint8 would wrap around instead of going negative, so make sure
    # at least one operand of the subtraction is floating point.
    centers = np.asarray(cluster_centers)
    if not np.issubdtype(centers.dtype, np.inexact):
        centers = centers.astype(np.result_type(centers.dtype, np.float32))

    distance_mat = np.zeros((nq, num_clusters))

    # One centroid per iteration keeps the temporary at (nq, dim).
    # Broadcasting subtracts the centroid from every row without first
    # materialising an (nq, dim) replicated copy of it.
    for i in range(num_clusters):
        diff = query_vecs - centers[i]
        distance_mat[:, i] = np.sum(diff ** 2, axis=1)

    return distance_mat

def kmeans_predict_sorted(cluster_centers, query_vecs):
    """
    Rank all centroids for each query vector, nearest first.

    Input:
        cluster_centers: 2-d array (num_clusters, dim)
        query_vecs: 2-d array (num_queries, dim)
    Output:
        ID_mat (num_queries, num_clusters),
            row q lists the centroid IDs ordered by increasing distance
            to query q, so column 0 is the nearest centroid
    """
    dist_to_centroids = compute_centroid_distances(cluster_centers, query_vecs)
    return dist_to_centroids.argsort(axis=1)

In [21]:
# Number of base vectors (rows of xb)
nb = xb.shape[0]
print(nb)

1000000


In [17]:
# For every base vector, rank all centroids by increasing distance, then keep
# the nearest centroid (column 0) as that vector's partition ID
centroid_ID_sorted = kmeans_predict_sorted(centroids, xb)
centroid_ID_first = centroid_ID_sorted[:, 0]

In [30]:
# Show the full centroid ranking per vector and the nearest-centroid column
print(centroid_ID_sorted.shape)
print(centroid_ID_sorted)
print(centroid_ID_first, centroid_ID_first.shape)

(1000000, 32)
[[21 16 14 ... 18  0 13]
 [16 31 17 ... 18 13 29]
 [21 16 31 ...  0 20 13]
 ...
 [29 27  1 ... 16  7  2]
 [ 4 23 16 ... 22 18 29]
 [13 29 20 ...  2 16  7]]
[21 16 21 ... 29  4 13] (1000000,)


In [38]:
# Build the mapping: partition ID -> list of the vector IDs assigned to it
partition_id_vec_id_list_1M = {pid: [] for pid in range(ncentroids)}
for vec_id in range(nb):
    partition_id_vec_id_list_1M[int(centroid_ID_first[vec_id])].append(vec_id)

# Report how evenly the vectors are spread over the partitions
partition_size = [len(partition_id_vec_id_list_1M[pid]) for pid in range(ncentroids)]
for pid, size in enumerate(partition_size):
    print('items in partition ', pid, size, 'average =', int(nb/ncentroids))
print('max size', np.max(partition_size))
print('min size', np.min(partition_size))
print('max / min size', np.max(partition_size) / np.min(partition_size))

items in partition  0 35342 average = 31250
items in partition  1 30844 average = 31250
items in partition  2 19071 average = 31250
items in partition  3 42449 average = 31250
items in partition  4 18408 average = 31250
items in partition  5 30993 average = 31250
items in partition  6 25688 average = 31250
items in partition  7 32268 average = 31250
items in partition  8 33918 average = 31250
items in partition  9 32005 average = 31250
items in partition  10 24843 average = 31250
items in partition  11 22864 average = 31250
items in partition  12 44867 average = 31250
items in partition  13 25499 average = 31250
items in partition  14 26393 average = 31250
items in partition  15 25796 average = 31250
items in partition  16 24805 average = 31250
items in partition  17 23710 average = 31250
items in partition  18 48302 average = 31250
items in partition  19 34569 average = 31250
items in partition  20 30729 average = 31250
items in partition  21 28441 average = 31250
items in partition  

Wenqi: The partitions are more unbalanced than those produced by k-means.

max size 48302
min size 18408
max / min size 2.623967840069535

The k-means comparison also used 32 partitions and the same number of training vectors (256K).

In [34]:
import heapq

def scan_partition(query_vec, partition_id_list, vector_set):
    """
    Brute-force scan of one partition for the nearest neighbor of a query.

    As used in this notebook:
        query_vec = (128, )
        partition_id_list = (N_num_vec, )
        vector_set = 1M dataset (1M, 128)

    Returns the ID from partition_id_list whose vector has the smallest L2
    distance to query_vec (the last such ID when several tie), or None when
    partition_id_list is empty.
    """
    # Subtracting two unsigned-integer vectors wraps around instead of going
    # negative; promote a non-float query so the difference is done in float.
    query = np.asarray(query_vec)
    if not np.issubdtype(query.dtype, np.inexact):
        query = query.astype(np.result_type(query.dtype, np.float32))

    # Start from +inf rather than a large finite constant, so a candidate is
    # always returned no matter how large the distances are.
    min_dist = float('inf')
    min_dist_ID = None
    for vec_id in partition_id_list:
        dist = np.linalg.norm(query - vector_set[vec_id])
        if dist <= min_dist:
            min_dist = dist
            min_dist_ID = vec_id

    return min_dist_ID

In [39]:
N = 1000
#### Wenqi: here had a bug: previously xb, now xq

# Nearest centroid (column 0 of the ranking) for each of the first N queries
query_partition = kmeans_predict_sorted(centroids, xq[:N])[:, 0]

# Search only inside the partition each query falls into
nearest_neighbors = []
for query_id in range(N):
    candidates = partition_id_vec_id_list_1M[int(query_partition[query_id])]
    best_id = scan_partition(xq[query_id], candidates, xb)
    nearest_neighbors.append(best_id)
    print(query_id, best_id)

0 851705
1 588616
2 869848
3 335355
4 922029
5 508403
6 167240
7 327960
8 572021
9 2392
10 878295
11 353408
12 166452
13 359275
14 368047
15 776345
16 373550
17 163431
18 602078
19 149924
20 988932
21 671173
22 807108
23 436181
24 698614
25 300633
26 922290
27 221028
28 495856
29 785288
30 425493
31 407192
32 295009
33 909787
34 886390
35 825435
36 602485
37 812777
38 341091
39 33749
40 982373
41 499763
42 915089
43 43368
44 9190
45 858815
46 436180
47 283703
48 377991
49 107540
50 849174
51 467756
52 803688
53 74183
54 819365
55 882827
56 535502
57 779586
58 702690
59 12747
60 658574
61 502305
62 795958
63 45494
64 943477
65 715285
66 840644
67 850184
68 239522
69 434096
70 116383
71 753756
72 222336
73 787309
74 436415
75 629700
76 470987
77 245786
78 680004
79 694111
80 659382
81 380662
82 555466
83 228174
84 904722
85 853398
86 299980
87 138747
88 698591
89 845760
90 32380
91 490742
92 526617
93 617987
94 657946
95 330638
96 898436
97 763985
98 532460
99 72670
100 791282
101 88094


765 190828
766 286990
767 367480
768 873450
769 842112
770 850300
771 15926
772 955573
773 364685
774 481365
775 350272
776 173873
777 118452
778 221314
779 554970
780 210716
781 124102
782 366901
783 846066
784 930962
785 262558
786 75426
787 151277
788 268371
789 779803
790 248284
791 145455
792 582759
793 924445
794 712932
795 828717
796 740540
797 730877
798 980926
799 517289
800 360781
801 987659
802 353968
803 400426
804 139296
805 117740
806 453757
807 622769
808 283818
809 364339
810 970904
811 559136
812 190387
813 955520
814 147829
815 205660
816 520919
817 898180
818 817187
819 703036
820 238137
821 342986
822 528733
823 231175
824 5223
825 365903
826 822125
827 274414
828 977449
829 417691
830 36612
831 85623
832 927232
833 78944
834 264686
835 498594
836 796565
837 763906
838 458628
839 961898
840 316040
841 265298
842 171116
843 61173
844 852061
845 519099
846 863005
847 692331
848 444577
849 322370
850 863065
851 3677
852 489485
853 290520
854 519381
855 326462
856 67184

In [40]:
# First 100 queries, 67%
# First 10000 queries, 69.6%
# NOTE(review): the 69.6% output recorded for this cell (696 correct) comes
# from N = 1000 -- confirm which query count the line above refers to

# K-means > 70.09% and the unbalanced factor is lower...

# A query counts as correct when the ID found in its partition equals the
# first entry of its ground-truth row
correct_count = sum(1 for q in range(N) if nearest_neighbors[q] == gt[q][0])

print(correct_count, 'recall@1 = ', correct_count / N)

696 recall@1 =  0.696
