## Dataset Preparation for DiskANN

In [1]:
import numpy as np
import sys
import os

# Directory that holds the saved embedding/label .npy files.
# Paths are built by plain string concatenation (embeddings_root + "<file name>"),
# so the value must keep its trailing slash.
embeddings_root = 'path/to/embeddings/'

def generate_bin_data_from_ndarray(embedding_path, bin_out_path, embedding_dims):
    """Export prefix-sliced copies of a saved embedding matrix as ``.fbin`` files.

    For every ``d`` in ``embedding_dims`` the first ``d`` columns of the matrix
    are written to ``<bin_out_path>_d<d>.fbin`` as:

    * 4 bytes: number of rows (little-endian integer)
    * 4 bytes: number of columns actually written (little-endian integer)
    * the sliced values as little-endian float32, row after row

    Parameters
    ----------
    embedding_path : str
        Path of the ``.npy`` file to read with ``np.load``.
    bin_out_path : str
        Output path prefix; ``"_d<d>.fbin"`` is appended for each dimension.
    embedding_dims : iterable of int
        Number of leading columns to keep for each output file.

    Notes
    -----
    If ``d`` exceeds the number of columns in the matrix the slice is simply
    shorter; the header then records the real column count while the file name
    still carries the requested ``d``.
    """
    data_orig = np.load(embedding_path)
    for d in embedding_dims:
        data_sliced = data_orig[:, :d]
        outfile = bin_out_path+"_d"+str(d)+".fbin"
        print(outfile.split("/")[-1])
        print("Array sliced: ", data_sliced.shape)

        num_points = int(data_sliced.shape[0]).to_bytes(4, 'little')
        data_dim = int(data_sliced.shape[1]).to_bytes(4, 'little')

        # Stream header and payload straight into the destination file.
        # No scratch file is created in the working directory, so nothing can
        # be clobbered or left behind if the write fails part-way.
        with open(outfile, "wb") as new:
            new.write(num_points)
            new.write(data_dim)
            # '<f4' keeps the payload byte order consistent with the header.
            data_sliced.astype('<f4').tofile(new)

In [None]:
# Export one database file (train split) and one query file (val split)
# for every embedding size of the "rr" model.
nesting_list = [8, 16, 32, 64, 128, 256, 512, 1024, 2048]
for d in nesting_list:
    for split_name, fbin_name in (("train", "database"), ("val", "queries")):
        if split_name == "val":
            # Blank line between the database log and the query log.
            print()
        generate_bin_data_from_ndarray(
            embeddings_root + "1K_" + split_name + "_mrl0_e0_ff" + str(d) + "-X.npy",
            "../build/data/rr-resnet50/fbin/" + fbin_name,
            [d],
        )

## Generate Exact Search ground truth from queries and database

In [2]:
%%bash
use_mrl=mr # mr or rr

# Compute the exact top-100 L2 neighbours of every query for each embedding size.
# Every path uses ${use_mrl}: without the leading '$' bash passes the literal
# text "{use_mrl}" and the tool looks in a directory that does not exist.
for d in 8 16 32 64 128 256 512 1024 2048
do
    ./../build/tests/utils/compute_groundtruth  --data_type float --dist_fn l2 \
    --base_file ../build/data/${use_mrl}-resnet50/fbin/database_d${d}.fbin \
    --query_file  ../build/data/${use_mrl}-resnet50/fbin/queries_d${d}.fbin \
    --gt_file ../build/data/${use_mrl}-resnet50/exact_gt100/${use_mrl}_r50_queries_d${d}_gt100 --K 100
done

## Build DiskANN In-Memory Index

In [None]:
%%bash
opq_bytes=32

# Model(s) to index: mr or rr. The value is used both as the data directory
# prefix (<use_mrl>-resnet50) and as the index name prefix, so it has to match
# the name used by the ground-truth and search cells.
for use_mrl in mr
do
    # Dimensions start at 32 here (opq_bytes is 32).
    for d in 32 64 128 256 512 1024 2048
    do
        echo -e "Building index ${use_mrl}1K_opq${opq_bytes}_R64_L100_A1.2_d$d\n"
        ./../build/tests/build_memory_index --data_type float --dist_fn l2 \
        --data_path ../build/data/${use_mrl}-resnet50/fbin/database_d${d}.fbin \
        --index_path_prefix ../build/data/${use_mrl}-resnet50/memory-index/${use_mrl}1K_opq${opq_bytes}_R64_L100_A1.2_d${d} \
        -R 64 -L 100 --alpha 1.2 --build_PQ_bytes ${opq_bytes} --use_opq
    done
done

## Build DiskANN SSD Index

In [None]:
%%bash
use_mrl=rr # mr or rr

# opq_bytes is set by the loops below; each section writes to its own
# hard-coded output directory (disk-index-no-reorder / disk-index).

# Disable post-hoc re-ranking by setting PQ_disk_bytes = build_PQ_bytes
for opq_bytes in 32 48 64
do
    for d in 1024
    do
        echo -e "Building disk OPQ index ${use_mrl}1K_opq${opq_bytes}_R64_L100_B0.3_d$d\n"
        ./../build/tests/build_disk_index --data_type float --dist_fn l2 \
        --data_path ../build/data/${use_mrl}-resnet50/fbin/database_d${d}.fbin \
        --index_path_prefix ../build/data/${use_mrl}-resnet50/disk-index-no-reorder/${use_mrl}1K_opq${opq_bytes}_R64_L100_B0.3_d${d} \
        -R 64 -L 100 -B 0.3 -M 40 --PQ_disk_bytes $opq_bytes --build_PQ_bytes $opq_bytes --use_opq
    done
done

# Build index with implicit post-hoc full-precision reranking
# (same command without --PQ_disk_bytes).
for opq_bytes in 32 48 64
do
    for d in 128 1024
    do
        ./../build/tests/build_disk_index --data_type float --dist_fn l2 \
        --data_path ../build/data/${use_mrl}-resnet50/fbin/database_d${d}.fbin \
        --index_path_prefix ../build/data/${use_mrl}-resnet50/disk-index/${use_mrl}1K_opq${opq_bytes}_R64_L100_B0.3_d${d} \
        -R 64 -L 100 -B 0.3 -M 40 --build_PQ_bytes $opq_bytes --use_opq
        echo -e "Build index ${use_mrl}1K_opq${opq_bytes}_R64_L100_B0.3_d$d\n"
    done
done

## Search DiskANN Memory Index

In [1]:
%%bash
opq_bytes=32

# Search every in-memory index of both models with K = L = 100.
for use_mrl in rr mr
do
    for d in 32 64 128 256 512 1024 2048
    do
        # The ground-truth file name follows the pattern written by the
        # compute_groundtruth cell: <use_mrl>_r50_queries_d<d>_gt100, so each
        # model is scored against its own exact neighbours.
        ./../build/tests/search_memory_index --data_type float --dist_fn l2 \
        --index_path_prefix ../build/data/${use_mrl}-resnet50/memory-index/${use_mrl}1K_opq${opq_bytes}_R64_L100_A1.2_d${d} \
        --query_file  ../build/data/${use_mrl}-resnet50/fbin/queries_d${d}.fbin \
        --gt_file ../build/data/${use_mrl}-resnet50/exact_gt100/${use_mrl}_r50_queries_d${d}_gt100 \
        -K 100 -L 100 --result_path ../build/data/${use_mrl}-resnet50/res/memory-index/d${d}/opq${opq_bytes}
        echo -e "Searched index ${use_mrl}1K_opq${opq_bytes}_R64_L100_A1.2_d$d\n"
    done
done

## Search DiskANN SSD Index

In [None]:
%%bash
opq_bytes=48
use_mrl=mr # mr or rr; must match the value used when the index was built
reorder=disk-index # sub-directory of the index to search (disk-index or disk-index-no-reorder)

for d in 1024
do
    for W in 2 8 16 32 # search quality
    do
        # The ground-truth file name follows the pattern written by the
        # compute_groundtruth cell: <use_mrl>_r50_queries_d<d>_gt100.
        ./../build/tests/search_disk_index --data_type float --dist_fn l2 \
        --index_path_prefix ../build/data/${use_mrl}-resnet50/${reorder}/${use_mrl}1K_opq${opq_bytes}_R64_L100_B0.3_d${d} \
        --query_file  ../build/data/${use_mrl}-resnet50/fbin/queries_d${d}.fbin \
        --gt_file ../build/data/${use_mrl}-resnet50/exact_gt100/${use_mrl}_r50_queries_d${d}_gt100 \
        -K 100 -L 100 -W ${W} --num_nodes_to_cache 100000 --result_path ../build/data/${use_mrl}-resnet50/res/${reorder}/d${d}/opq${opq_bytes}_W${W}
        echo -e "Searched index ${use_mrl}1K_opq${opq_bytes}_R64_L100_B0.3_d$d\n"
    done
done

# DiskANN Eval

In [1]:
# Evaluation settings read as module-level globals by the metric code.
DATASET = '1K'                  # one of '1K', '4K', 'V2'
DISKANN_INDEX = 'memory-index'  # or 'disk-index'
CONFIG = 'mr'                   # one of 'mr', 'rr'
NESTING = (CONFIG == 'mr')      # True only for the 'mr' configuration
D = 2048

In [2]:
def compute_mAP_recall_at_k(val_classes, db_classes, neighbors, k, samples_per_class=1300):
    """Score class-based retrieval quality of the first ``k`` neighbours per query.

    Parameters
    ----------
    val_classes : ndarray
        Class label of every query; indexed by query number.
    db_classes : ndarray
        Class label of every database item; indexed with neighbour ids.
    neighbors : ndarray
        2-D array of neighbour ids, one row per query. Only the first ``k``
        columns are used, so each row must hold at least ``k`` ids.
    k : int
        Length of the neighbour list to evaluate.
    samples_per_class : int, optional
        Denominator of the recall value. The default of 1300 is the constant
        the original code used (presumably the per-class image count of the
        dataset -- confirm before evaluating another dataset).

    Returns
    -------
    tuple
        ``(mAP, precision, recall, topk, majvote, unique_cls)`` where every
        entry except ``majvote`` is a mean over all queries:

        * ``mAP``        -- sum of precision@rank over the ranks that hit the
          query class, divided by ``k``
        * ``precision``  -- hits / ``k``
        * ``recall``     -- hits / ``samples_per_class``
        * ``topk``       -- 1 if at least one of the ``k`` neighbours hits, else 0
        * ``majvote``    -- always an empty list (kept for interface compatibility)
        * ``unique_cls`` -- number of distinct classes among the ``k`` neighbours
    """
    APs = []
    precision, recall, topk, majvote, unique_cls = [], [], [], [], []
    ranks = np.arange(1, k + 1)  # loop-invariant: 1-based rank of every position

    for i in range(val_classes.shape[0]):
        target = val_classes[i]
        indices = neighbors[i, :k]             # k-neighbour list of the i-th query
        # ravel() accepts label arrays of shape (N,) as well as (N, 1) and,
        # unlike squeeze(), never collapses to a 0-d value when k == 1.
        labels = np.ravel(db_classes[indices])
        matches = np.ravel(labels == target)

        unique_cls.append(len(np.unique(labels)))

        hits = np.sum(matches)
        topk.append(1 if hits > 0 else 0)

        recall.append(hits / samples_per_class)
        precision.append(hits / k)

        # precision@rank for every rank, then keep only the ranks that hit
        tps = np.cumsum(matches)
        precs = tps.astype(float) / ranks
        APs.append(np.sum(precs[matches]) / k)

    return np.mean(APs), np.mean(precision), np.mean(recall), np.mean(topk), majvote, np.mean(unique_cls)

In [3]:
def print_metrics(CONFIG, nesting_list, shortlist, metric, nprobe=1):
    """Print one result row per embedding size for a single retrieval metric.

    For every ``dim`` in ``nesting_list`` the saved search result is read,
    scored with ``compute_mAP_recall_at_k`` for every ``k`` in ``shortlist``
    and printed as ``[dim, opq, W, '<metric>@<k>', value, ...]``.

    Besides its arguments the function reads these module-level names:
    ``NESTING``, ``DATASET``, ``D``, ``DISKANN_INDEX``, ``embeddings_root`` and
    ``opq`` (the quantisation budget; must be assigned before the call).

    Parameters
    ----------
    CONFIG : str
        Model prefix used in the result directory name (``<CONFIG>-resnet50``).
    nesting_list : iterable of int
        Embedding sizes to report; sizes smaller than ``opq`` are skipped.
    shortlist : iterable of int
        Neighbour-list lengths ``k`` to evaluate.
    metric : str
        One of ``'topk'``, ``'mAP'``, ``'precision'``, ``'recall'``,
        ``'unique_cls'``; anything else prints ``"Unsupported metric!"``.
    nprobe : int, optional
        Unused; kept for interface compatibility.
    """
    # Labels do not depend on ``dim``, so load them once instead of per iteration.
    # Database: 1.2M x 1 for imagenet1k
    db_labels = np.load(embeddings_root + DATASET + "_train_mrl1_e0_ff2048-y.npy")
    # Query set: 50K x 1 for imagenet1k
    if NESTING:
        query_labels = np.load(embeddings_root + DATASET + "_val_mrl1_e0_ff2048-y.npy")
    else:
        query_labels = np.load(embeddings_root + DATASET + "_val_mrl0_e0_ff"+str(D)+"-y.npy")

    for dim in nesting_list:
        if opq > dim:
            continue

        for W in [32]:
            row = [dim, opq, W]
            # The SSD search cell appends "_W<W>" to its result prefix; the
            # in-memory search cell does not.
            beam_suffix = '' if DISKANN_INDEX == 'memory-index' else f'_W{W}'
            # Relative to the notebook directory, like every other cell.
            fileName = f'../build/data/{CONFIG}-resnet50/res/{DISKANN_INDEX}/d{dim}/opq{opq}{beam_suffix}_100_idx_uint32.bin'
            print(fileName)
            with open(fileName, 'rb') as f:
                data = np.fromfile(f, dtype='<u4')
            # The first two uint32 values are read as the row and column counts
            # (assumed from the header layout the .fbin writer uses), so the
            # shape is not tied to a 50K-query dataset.
            neighbors = data[2:].reshape(int(data[0]), int(data[1]))

            for k in shortlist:
                mAP, precision, recall, topk, majvote, unique_cls = compute_mAP_recall_at_k(query_labels, db_labels, neighbors, k)
                values = {
                    'topk': topk,
                    'mAP': mAP,
                    'precision': precision,
                    'recall': recall,
                    'unique_cls': unique_cls,
                }
                if metric in values:
                    row.append(f'{metric}@{k}')
                    row.append(values[metric])
                else:
                    print("Unsupported metric!")

            print(row)

In [None]:
# Memory OPQ64
# `opq` is read by print_metrics as a module-level global.
nesting_list = [64]
opq = 64
print(["dim", "opq", "beamwidth", "metric", "value"])
for top_ks, metric_name in (([1], 'topk'), ([10], 'mAP'), ([40], 'recall')):
    print_metrics(CONFIG, nesting_list, top_ks, metric_name)

In [None]:
## SSD no reranking index
# NOTE(review): print_metrics reads the module-level DISKANN_INDEX; presumably it
# has to name the SSD result directory before this cell is run -- confirm.
nesting_list = [32, 64, 128, 256, 512, 1024] # vector dims
print(["dim", "opq", "beamwidth", "metric", "value"])

# `opq` must stay a module-level name: print_metrics picks it up as a global.
for opq in [32, 64]:
    for top_ks, metric_name in (([1], 'topk'), ([10], 'mAP'), ([40], 'recall')):
        print_metrics(CONFIG, nesting_list, top_ks, metric_name)

## Post-hoc reranking

In [11]:
import numpy as np 
from sklearn.preprocessing import normalize

# Settings for the re-ranking experiment.
D = 2048
CONFIG = 'mr' # ['mr', 'rr']
NESTING = CONFIG == 'mr'
DISKANN_INDEX = 'memory-index'
DATASET = '1K' # ['1K', '4K', 'V2']

dim = 2048  # embedding size of the search result to load
opq = 64    # quantisation budget of the search result to load
db_rerank = np.load(embeddings_root+DATASET+'_train_mrl1_e0_ff2048-X.npy') # 1281167, 2048
fileName = f'../build/data/{CONFIG}-resnet50/res/{DISKANN_INDEX}/d{dim}/opq{opq}_100_idx_uint32.bin'

with open(fileName, 'rb') as f:
    data = np.fromfile(f, dtype='<u4')
# The first two uint32 values are read as the row and column counts (assumed
# from the header layout the .fbin writer uses); taking the shape from them
# keeps the cell valid for query sets that are not 50K rows long.
neighbors = data[2:].reshape(int(data[0]), int(data[1])) # 50K, 100 for the 1K dataset

In [None]:
def rerank(rerank_dim, shortlist, neighbors):
    """Re-order each query's neighbour list by distance in a prefix of the embedding.

    The stages are applied in order. Stage ``i`` keeps the first
    ``shortlist[i]`` neighbours of every query and sorts them by L2 distance
    between the query's first ``rerank_dim[i]`` values and the row-normalised
    first ``rerank_dim[i]`` values of each neighbour. The output of one stage
    is the input of the next.

    Reads the module-level arrays ``db_rerank`` (database embeddings, indexed
    by neighbour id) and ``queries`` (query embeddings, indexed by query
    number); both must exist before the call.
    NOTE(review): the query slice is used as-is, only the database vectors are
    normalised here.

    Keyword arguments:
    rerank_dim -- list of embedding sizes, one per stage
    shortlist -- list of neighbour-list lengths, one per stage
    neighbors -- 2-D array of neighbour ids, one row per query (not modified)

    Returns the re-ordered neighbour array of the last stage, or ``neighbors``
    itself when ``rerank_dim`` is empty.
    """
    for i in range(len(rerank_dim)):
        stage_dim = rerank_dim[i]
        stage_len = shortlist[i]
        db_rerank_new = db_rerank[:, :stage_dim]
        # Work on a copy: a plain slice is a view, and writing into it would
        # silently reorder the array the caller passed in.
        neighbors_new = neighbors[:, :stage_len].copy()

        # iterate over every query and re-order its shortlist by the stage_dim distances
        for j in range(len(neighbors_new)):
            query_vector = queries[j][:stage_dim]
            nn_indices = neighbors_new[j]

            # Keep the (stage_len, stage_dim) shape even for a shortlist of one,
            # otherwise normalize(axis=1) receives a 1-D array.
            NN_vectors_higher_dim = normalize(db_rerank_new[nn_indices], axis=1)
            L2_distances_reranked = np.linalg.norm(NN_vectors_higher_dim - query_vector, axis=1)

            reranked_neighbor_indices = np.argsort(L2_distances_reranked)
            neighbors_new[j] = nn_indices[reranked_neighbor_indices]

        neighbors = neighbors_new
    return neighbors