In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import normalize
import os
import time

## Configuration Variables

In [2]:
# --- Which pretrained model / index / dataset to process -------------------
dataset = '1K' # 1K, 4K, V2
index_type = 'exactl2' # exactl2, hnsw8, hnsw32
model = "mrl/" # mrl, mrl_e, ff
root = "../inference/"
use_funnel = True

# --- Reranking schedule and output directory -------------------------------
if use_funnel:
    nn_dir = "".join([root, "neighbors/funnel_retrieval/", model])
    # rerank cascade: dimensionalities at which neighbors are successively
    # re-ordered based on L2 distance
    rerank_dim = [16, 32, 64, 128, 2048]
    # set of shortlist cascades; each inner list has one length per rerank_dim entry
    shortlist_set = [
        [800, 400, 200, 50, 10],
        [400, 200, 50, 25, 10],
        [200, 100, 50, 25, 10],
    ]
else:
    nn_dir = "".join([root, "neighbors/reranked/", model])
    rerank_dim = [16]
    shortlist = [200]

retrieval_dim = 8 # scale at which to retrieve 2048-NN for all samples in query set
max_rerank_dim = 2048 # maximum dimensionality at which reranking may occur, usually = 2048

# Create the output directory (and any missing parents) if it is not there yet.
os.makedirs(nn_dir, exist_ok=True)

## Load Database and Query vectors

In [3]:
# Config tag embedded in the saved array file names, keyed by model directory.
pretrained_configs = {
    'mrl/': 'mrl1_e0_ff2048',
    'mrl_e/': 'mrl0_e1_ff2048',
    'ff/': 'mrl0_e0_ff16',
}
if model not in pretrained_configs:
    raise Exception("Unsupported pretrained model.")
config = pretrained_configs[model]

# naming format as in R50_inference.py (the files are .npy arrays despite the *_csv names)
db_csv = "".join([dataset, '_train_', config, '-X.npy'])
query_csv = "".join([dataset, '_val_', config, '-X.npy'])

# Database vectors, truncated to the first max_rerank_dim coordinates.
start = time.time()
db_rerank = np.load(root + db_csv)
db_rerank = db_rerank[:, :max_rerank_dim]
end = time.time() - start
print("Load database vectors (%d x %d), time= %f" % (db_rerank.shape[:2] + (end,)))

# Query vectors, truncated the same way.
start = time.time()
queries = np.load(root + query_csv)
queries = queries[:, :max_rerank_dim]
end = time.time() - start
print("Load query vectors (%d x %d), time= %f" % (queries.shape[:2] + (end,)))

# Row-wise normalization (sklearn's default norm) of both sets at the truncated dimensionality.
start = time.time()
queries = normalize(queries, axis=1)
db_rerank = normalize(db_rerank, axis=1)
end = time.time() - start
print("Normalization time= %f" % (end,))

Load database vectors (1281167 x 2048), time= 3.465177
Load query vectors (50000 x 2048), time= 0.142086
Normalization time= 9.972036


## Load k-NN array

In [4]:
# Read the pre-computed neighbor table (one row per query, no header row) into a NumPy array.
start = time.time()
NN_file = "".join([
    root, "neighbors/", model, index_type, "_",
    str(retrieval_dim), "dim_2048shortlist_", dataset, ".csv",
])
neighbors = pd.read_csv(NN_file, header=None).to_numpy()
end = time.time() - start
# Report only the file name (text after the last "/"), the array shape and the load time.
print("Loaded %s : (%d x %d), time= %f" % ((NN_file.rsplit("/", 1)[-1],) + neighbors.shape[:2] + (end,)))

Loaded exactl2_8dim_2048shortlist_1K.csv : (50000 x 2048), time= 13.080295


In [5]:
# Sanity check: show the shapes of the three arrays used for reranking.
for label, array in (
    ("\nDB for reranking: ", db_rerank),
    ("Queries for reranking: ", queries),
    ("k-NN array: ", neighbors),
):
    print(label, array.shape)


DB for reranking:  (1281167, 2048)
Queries for reranking:  (50000, 2048)
k-NN array:  (50000, 2048)


In [6]:
def rerank(use_funnel, rerank_dim, shortlist, neighbors, db_vectors=None, query_vectors=None):
    """ Return shortlist of retrieved k-NN re-ordered by L2 distance at each rerank dimensionality

    For stage i, the first shortlist[i] columns of the current neighbor array are kept and
    every row is re-sorted by the L2 distance between the query and its candidate database
    vectors, both truncated to their first rerank_dim[i] coordinates. With several stages,
    the output of one stage is the input of the next (funnel retrieval).

    Keyword arguments:
    use_funnel -- boolean flag to rerank in a cascaded fashion via funnel retrieval;
                  when set, rerank_dim and shortlist must have the same length
    rerank_dim -- list of dimensionalities at which to rerank, one entry per stage
    shortlist -- list of shortlist lengths (neighbors kept), indexed by stage
    neighbors -- 2-D array of k-NN; row j holds database row indices for query j.
                 The array passed in is never modified.
    db_vectors -- database vectors used for reranking (default: notebook-level db_rerank)
    query_vectors -- query vectors, row j matching neighbors[j] (default: notebook-level queries)

    Returns a new array holding the reranked shortlist of the last stage. If rerank_dim
    is empty, the input array itself is returned.
    """
    # Fall back to the notebook-level arrays so existing four-argument calls keep working.
    if db_vectors is None:
        db_vectors = db_rerank
    if query_vectors is None:
        query_vectors = queries

    # ensure these match for funnel
    if use_funnel:
        assert len(rerank_dim) == len(shortlist), \
            "funnel retrieval needs exactly one shortlist length per rerank dimension"

    for i, dim in enumerate(rerank_dim):
        k = shortlist[i]
        db_slice = db_vectors[:, :dim]
        # Basic slicing returns a view. Copy it so the in-place row updates below never
        # reorder the caller's array, which is reused for every cascade in shortlist_set.
        candidates = neighbors[:, :k].copy()

        # iterate over every query and re-order its shortlist based on distances at `dim`
        for j in range(len(candidates)):
            query_vector = query_vectors[j][:dim]
            nn_indices = candidates[j]

            # Indexing with a 1-D index array yields a 2-D block even for a single
            # candidate, so no squeeze() is needed before the row-wise normalization.
            nn_vectors = normalize(db_slice[nn_indices], axis=1)

            # The query slice is not re-normalized; against unit-norm candidates that only
            # shifts every squared distance by the same constant, so the order is unaffected.
            l2_distances = np.linalg.norm(nn_vectors - query_vector, axis=1)

            # The right-hand side is a fresh array, so writing it back into the row is safe.
            candidates[j] = nn_indices[np.argsort(l2_distances)]

        neighbors = candidates
    return neighbors

## Neighbors Shortlist Generation

In [7]:
if use_funnel:
    # Funnel Retrieval (increase dims and reduce shortlist length in sync):
    # one output file per shortlist cascade in shortlist_set.
    for shortlist in shortlist_set:
        start = time.time()
        NN_cascade = rerank(True, rerank_dim, shortlist, neighbors)
        end = time.time() - start
        print("\nRetrieve @%d + funnel retrieval @%s with shortlist %s, time = %f" 
              % (retrieval_dim, rerank_dim, shortlist, end))

        neighbors_df = pd.DataFrame(NN_cascade)
        print(neighbors_df.shape)

        # File name records retrieval dim, the full cascade, the shortlist schedule,
        # the dataset and the index type.
        filename = "".join([
            str(retrieval_dim), "dim-cascade", str(rerank_dim), "_",
            "shortlist", str(shortlist), "_", dataset, "_", index_type, ".csv",
        ])

        print("Saving config: ", filename)
        neighbors_df.to_csv(nn_dir + filename, header=None, index=None)

else:
    # Retrieve k-NN array with D_r and rerank with D_s: one output file per rerank dim.
    for dim in rerank_dim:
        start = time.time()
        neighbors_reranked = rerank(use_funnel, [dim], shortlist, neighbors)
        end = time.time() - start
        print("\nD_r = %d , D_s = %d, time = %f" % (retrieval_dim, dim, end))

        neighbors_df = pd.DataFrame(neighbors_reranked)
        print(neighbors_df.shape)

        filename = "".join([
            str(retrieval_dim), "dim-reranked", str(dim), "_",
            str(shortlist[0]), "shortlist_", dataset, "_", index_type, ".csv",
        ])

        print("Saving config: ", filename)
        neighbors_df.to_csv(nn_dir + filename, header=None, index=None)


Retrieve @8 + funnel retrieval @[16, 32, 64, 128, 2048] with shortlist [800, 400, 200, 50, 10], time = 53.174906
(50000, 10)
Saving config:  8dim-cascade[16, 32, 64, 128, 2048]_shortlist[800, 400, 200, 50, 10]_1K_exactl2.csv

Retrieve @8 + funnel retrieval @[16, 32, 64, 128, 2048] with shortlist [400, 200, 50, 25, 10], time = 39.799965
(50000, 10)
Saving config:  8dim-cascade[16, 32, 64, 128, 2048]_shortlist[400, 200, 50, 25, 10]_1K_exactl2.csv

Retrieve @8 + funnel retrieval @[16, 32, 64, 128, 2048] with shortlist [200, 100, 50, 25, 10], time = 35.106180
(50000, 10)
Saving config:  8dim-cascade[16, 32, 64, 128, 2048]_shortlist[200, 100, 50, 25, 10]_1K_exactl2.csv
