In [9]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import normalize
import os
import time

In [18]:
config = "ff/" # mrl, mrl_e, ff
root_dir = "/home/aniketr/Documents/imagenetv2/"
use_funnel = True
index_type = 'exactl2' # exactl2, hnsw8, hnsw32
dataset = 'imagenetv2' # imagenet1k, imagenetv2, imagenet4m

if not use_funnel:
    nn_dir = root_dir+"neighbors/reranked/"
    rerank_dim = [2048]
    shortlist = [200]
else:
    nn_dir = root_dir+"neighbors/funnel_retrieval/"
    # rerank_dim: scale at which neighbors will be re-ordered based on L2 distance
    rerank_dim = [16, 32, 64, 128, 2048] 
    # shortlist_set: set of corresponding shortlist lengths for reranking, 1-to-1 correspondence with rerank_dim
    shortlist_set = [[800,400,200,50,10], [400,200,50,25,10], [200,100,50,25,10]] 

max_rerank_dim = 2048 # maximum dimensionality at which reranking may occur, usually = 2048
retrieval_dim = 8 # scale at which to retrieve 2048-NN for all samples in query set

## Load knn array, database vectors, and query vectors

In [19]:
db_csv = dataset+'_val_nesting1_sh0_ff2048-X.npy'
query_csv = dataset+'_val_nesting1_sh0_ff2048-X.npy'

start = time.time()
db_rerank = np.load(root_dir+db_csv)[:, :max_rerank_dim]
end = time.time() - start
print("Load database vectors (%d x %d), time= %f" % (db_rerank.shape[0], db_rerank.shape[1], end))

start = time.time()
queries = np.load(root_dir+query_csv)[:, :max_rerank_dim]
end = time.time() - start
print("Load query vectors (%d x %d), time= %f" % (queries.shape[0], queries.shape[1], end))

start = time.time()
queries = normalize(queries, axis=1)
db_rerank = normalize(db_rerank, axis=1)
end = time.time() - start
print("Normalization time= %f" % (end))

Load database vectors (10000 x 2048), time= 0.047169
Load query vectors (10000 x 2048), time= 0.075541
Normalization time= 0.145540


## Modify below to avoid expensive file loads for 4M dataset

In [20]:
start = time.time()
NN_file = root_dir+"neighbors/"+index_type+"_"+ str(retrieval_dim)+"dim-2048-NN_"+dataset+".csv"
neighbors = pd.read_csv(NN_file, header=None).to_numpy()

end = time.time() - start
print("Loaded %s : (%d x %d), time= %f" % (NN_file.split("/")[-1], neighbors.shape[0], neighbors.shape[1], end))

Loaded exactl2_8dim-2048-NN_imagenetv2.csv : (10000 x 2048), time= 1.090317


In [21]:
print("\nDB for reranking: ", db_rerank.shape)
print("Queries for reranking: ", queries.shape)
print("k-NN array: ", neighbors.shape)


DB for reranking:  (10000, 2048)
Queries for reranking:  (10000, 2048)
k-NN array:  (10000, 2048)


# Naive Routing/Cascading Strategy

In [22]:
def rerank(use_funnel, rerank_dim, shortlist, neighbors):
    
    # ensure these match for naive routing strategy
    if use_funnel:
        assert len(rerank_dim) == len(shortlist)

    for i in range(len(rerank_dim)):
        db_rerank_new = db_rerank[:, :rerank_dim[i]]
        neighbors_new = neighbors[:, :shortlist[i]]

        # iterate over every query and re-order 2048-NN based on rerank_dim representation distances
        for j in range(len(neighbors)):
            query_vector = queries[j][:rerank_dim[i]]
            #print("Query vector: ", query_vector.shape)
            nn_indices = neighbors_new[j][:shortlist[i]]

            #NN_vectors_original = normalize(db_retrieval[nn_indices].squeeze(), axis = 1)
            NN_vectors_higher_dim = normalize(db_rerank_new[nn_indices].squeeze(), axis=1)
            #print("NN vector original and higher dim: ", NN_vectors_original.shape, NN_vectors_higher_dim.shape)

            #L2_distances_orig = np.linalg.norm(NN_vectors_original - query_vector[:retrieval_dim], axis=1)
            #print("Sorting at retrieval dim: ", np.argsort(L2_distances_orig)[:10]) #sanity test this should be 0, 1, 2 ...
            L2_distances_reranked = np.linalg.norm(NN_vectors_higher_dim - query_vector[:rerank_dim[i]], axis=1)
            #print("Sorting at rerank dim: ", np.argsort(L2_distances_reranked)[:10]) #reorder indices based on higher dim representations

            reranked_neighbor_indices = np.argsort(L2_distances_reranked)
            reranked_neighbors = neighbors_new[j, reranked_neighbor_indices]
            neighbors_new[j] = reranked_neighbors
        #print("DB rerank: ", db_rerank_new.shape)
        #print("Neighbors: ", neighbors_new.shape)
        neighbors = neighbors_new
    return neighbors

## Rerank over rerank_dim list for fixed shortlist length k. Retrieval dim is also fixed and loaded from NN.csv 

In [23]:
for dim in rerank_dim:
    start = time.time()
    neighbors_reranked = rerank(use_funnel, [dim], shortlist, neighbors)
    end = time.time() - start
    print("\nRetrieve @%d + rerank@%d, time = %f" % (retrieval_dim, dim, end))

    neighbors_df = pd.DataFrame(neighbors_reranked)
    print(neighbors_df.shape)

    if not os.path.isdir(nn_dir):
        os.makedirs(nn_dir)

    filename = str(retrieval_dim)+"dim-reranked"+str(dim)+"_"+str(shortlist[0])+"shortlist_"+dataset+"_"+index_type+".csv"

    print("Saving config: ", filename)
    pd.DataFrame(neighbors_df).to_csv(nn_dir+filename, header=None, index=None)


Retrieve @8 + rerank@16, time = 0.775230
(10000, 200)
Saving config:  8dim-reranked16_200shortlist_imagenetv2_exactl2.csv

Retrieve @8 + rerank@32, time = 0.814424
(10000, 200)
Saving config:  8dim-reranked32_200shortlist_imagenetv2_exactl2.csv

Retrieve @8 + rerank@64, time = 0.882801
(10000, 200)
Saving config:  8dim-reranked64_200shortlist_imagenetv2_exactl2.csv

Retrieve @8 + rerank@128, time = 1.074831
(10000, 200)
Saving config:  8dim-reranked128_200shortlist_imagenetv2_exactl2.csv

Retrieve @8 + rerank@2048, time = 6.858704
(10000, 200)
Saving config:  8dim-reranked2048_200shortlist_imagenetv2_exactl2.csv


## Funnel Retrieval (increase dims and reduce shortlist length in sync)

In [26]:
if use_funnel:
    for shortlist in shortlist_set:
        start = time.time()
        NN_cascade = rerank(1, rerank_dim, shortlist, neighbors)
        end = time.time() - start
        print("\nRetrieve @%d + funnel retrieval @%s with shortlist %s, time = %f" 
              % (retrieval_dim, rerank_dim, shortlist, end))

        neighbors_df = pd.DataFrame(NN_cascade)
        print(neighbors_df.shape)

        filename = str(retrieval_dim)+"dim-cascade"+str(rerank_dim)+"_"+str(shortlist)
        +"shortlist_"+dataset+"_"+index_type+".csv"

        print("Saving config: ", filename)
        pd.DataFrame(neighbors_df).to_csv(nn_dir+filename, header=None, index=None)


Retrieve @8 + funnel retrieval @[16, 32, 64, 128, 2048] with shortlist [800, 400, 200, 50, 10], time = 5.074842
(10000, 10)


TypeError: bad operand type for unary +: 'str'