This notebook measures the query time, recall, and memory use of the FAISS inverted file index (IVF-Flat) solution.

In [1]:
import pickle
import collections
import matplotlib.pyplot as plt
import faiss
import numpy as np
import itertools
import statistics
import matplotlib.cm as cm
import time
import os
from time import perf_counter_ns

In [2]:
# Load the precomputed embeddings and the cluster label information from disk.
# NOTE: pickle.load can execute arbitrary code; only open pickles from a trusted source.
with open('embedding64.pickle', 'rb') as embedding_file:
    embedding64 = pickle.load(embedding_file)

with open('label_info.pickle', 'rb') as label_file:
    label_info = pickle.load(label_file)

# Show which fields each loaded dictionary provides.
for loaded in (embedding64, label_info):
    print(loaded.keys())

dict_keys(['embed_all', 'embed_raw', 'embed_l2_norm', 'restore_order', 'embed_correct_coverage_fh', 'embed_l2_norm_correct_coverage_fh'])
dict_keys(['batch id', 'age', 'total_cg', 'average_cg_rate', 'total_ch', 'average_ch_rate', 'hic_counts', 'cell_name_higashi', 'major', 'minor', 'cluster label', 'cluster label minor'])


In [3]:
# Build lookup tables that map every cluster label to the positions of the
# cells carrying that label, then report the number of cells per cluster.
true_major = label_info['cluster label']
true_minor = label_info['cluster label minor']

# Sort the unique labels: iterating a bare set of strings yields a different
# order on every kernel start, which made the printed report non-reproducible.
major_labels = sorted(set(true_major))
minor_labels = sorted(set(true_minor))

# label -> list of cell positions (filled in ascending order below)
major = {label: [] for label in major_labels}
minor = {label: [] for label in minor_labels}

# The loop variables are deliberately NOT called `min`/`max`: at notebook top
# level that would overwrite the builtins for every later cell.
for cell_idx, (major_label, minor_label) in enumerate(zip(true_major, true_minor)):
    major[major_label].append(cell_idx)
    minor[minor_label].append(cell_idx)

print("MAJOR CLUSTERS")
for label, members in major.items():
    print("    " + label + ": " + str(len(members)))

print("MINOR CLUSTERS")
for label, members in minor.items():
    print("    " + label + ": " + str(len(members)))

MAJOR CLUSTERS
    Sst: 217
    L2/3: 551
    Vip: 171
    ODC: 1245
    L5: 180
    L6: 86
    MG: 422
    Pvalb: 134
    Ndnf: 144
    Endo: 205
    MP: 100
    Astro: 449
    L4: 131
    OPC: 203
MINOR CLUSTERS
    L6-2: 19
    Endo-2: 69
    Endo-3: 85
    Astro-1: 449
    L2/3-1: 137
    MP-1: 100
    Vip-1: 45
    Vip-2: 126
    MG-1: 422
    OPC-1: 203
    ODC-1: 810
    L5-1: 48
    L4-1: 131
    Ndnf-2: 63
    L5-3: 58
    L6-1: 67
    Pvalb-2: 21
    Pvalb-1: 113
    Sst-1: 50
    L2/3-2: 137
    L2/3-3: 127
    Endo-1: 51
    L2/3-4: 150
    Sst-2: 107
    Sst-3: 60
    L5-2: 74
    Ndnf-1: 59
    ODC-2: 435
    Ndnf-3: 22


In [4]:
# function to get memory footprint for index
# source: https://www.pinecone.io/learn/series/faiss/product-quantization/
def get_memory(index):
    """Return the on-disk size, in bytes, of ``index`` as written by FAISS.

    The index is serialized with ``faiss.write_index`` into a private
    temporary directory, the resulting file size is read, and the directory
    (with the file) is removed again.

    Parameters
    ----------
    index : FAISS index object accepted by ``faiss.write_index``.

    Returns
    -------
    int
        Size of the serialized index file in bytes.
    """
    import tempfile  # local import: only this helper needs it

    # A private temporary directory (instead of a fixed './temp.index') cannot
    # clobber a file in the working directory and is cleaned up even when
    # writing or stat-ing the file raises.
    with tempfile.TemporaryDirectory() as tmp_dir:
        index_path = os.path.join(tmp_dir, 'temp.index')
        faiss.write_index(index, index_path)
        return os.path.getsize(index_path)

In [5]:
# function to calculate recall based on search results
def get_recall_min(i, result_min):
    """Return the minor-cluster recall of a search result for query cell ``i``.

    Recall is the number of distinct ids in ``result_min`` that belong to the
    true minor cluster of cell ``i``, divided by the size of that cluster.
    """
    cluster_members = minor[true_minor[i]]
    true_positives = set(cluster_members) & set(result_min)
    return len(true_positives) / len(cluster_members)

def get_recall_maj(i, result_maj):
    """Return the major-cluster recall of a search result for query cell ``i``.

    Recall is the number of distinct ids in ``result_maj`` that belong to the
    true major cluster of cell ``i``, divided by the size of that cluster.
    """
    cluster_members = major[true_major[i]]
    true_positives = set(cluster_members) & set(result_maj)
    return len(true_positives) / len(cluster_members)

In [6]:
# create input database
# Copies the 'embed_l2_norm' entry of the loaded embeddings into a NumPy array.
# The experiment below indexes it by row (one row per query) and hands it to FAISS.
# NOTE(review): presumably a 2-D float array with 64 columns — confirm dtype/shape.
database = np.array(embedding64["embed_l2_norm"]) 

In [7]:
def query_rep(index, query, k, n_rounds=3, n_searches=100):
    """Time repeated ``index.search(query, k)`` calls.

    The search is executed ``n_searches`` times back to back and the elapsed
    wall-clock time of that whole batch is measured with ``perf_counter_ns``.
    This is repeated for ``n_rounds`` independent rounds.

    Parameters
    ----------
    index : object with a ``search(query, k)`` method returning ``(D, I)``.
    query : query passed through unchanged to ``index.search``.
    k : number of neighbours requested from ``index.search``.
    n_rounds : int, default 3
        Number of timed rounds (length of the returned timing list).
    n_searches : int, default 100
        Number of searches executed inside each timed round.

    Returns
    -------
    I : the second element returned by the last ``index.search`` call.
    times : list of int
        One elapsed time per round, in nanoseconds for the whole batch of
        ``n_searches`` searches, sorted ascending (``times[0]`` is the fastest).

    Raises
    ------
    ValueError
        If ``n_rounds`` or ``n_searches`` is smaller than 1 (no search would
        run, so there would be no result to return).
    """
    if n_rounds < 1 or n_searches < 1:
        raise ValueError("n_rounds and n_searches must both be at least 1")

    times = []
    for _ in range(n_rounds):
        start = perf_counter_ns()
        for _ in range(n_searches):
            _, I = index.search(query, k)
        end = perf_counter_ns()
        times.append(end - start)

    times.sort()
    return I, times

In [8]:
# define experiment function
def IVF_flat_experiment(nlist, max_nprobe=32):
    """Benchmark a FAISS IVF-Flat index once per row of ``database``.

    For every row ``i`` a fresh ``faiss.IndexIVFFlat`` with ``nlist`` lists is
    trained on the database without row ``i``, filled with the full database,
    measured with ``get_memory`` and then queried with row ``i`` for every
    ``nprobe`` in ``1..max_nprobe``. Each query asks for as many neighbours as
    the true major (resp. minor) cluster of row ``i`` has members.

    Parameters
    ----------
    nlist : int
        Number of inverted lists passed to ``faiss.IndexIVFFlat``.
    max_nprobe : int, default 32
        Largest ``nprobe`` value to evaluate.

    Returns
    -------
    recall_min, recall_maj, speed_min, speed_maj : ndarray, shape (max_nprobe, n_rows)
        Row ``p`` holds the results for ``nprobe = p + 1``. The recall arrays
        hold the values of ``get_recall_min`` / ``get_recall_maj``; the speed
        arrays hold the first (smallest) timing returned by ``query_rep``.
    memory : ndarray, shape (n_rows,)
        Value of ``get_memory`` for the index built for each row.

    Raises
    ------
    ValueError
        If ``max_nprobe`` is smaller than 1.
    """
    if max_nprobe < 1:
        raise ValueError("max_nprobe must be at least 1")

    # Derive the problem size from the data instead of hard-coding it, so the
    # function keeps working when the database changes.
    n_rows, dim = database.shape

    # initialize empty arrays for results
    recall_min = np.zeros([max_nprobe, n_rows])
    recall_maj = np.zeros([max_nprobe, n_rows])
    speed_min = np.zeros([max_nprobe, n_rows])
    speed_maj = np.zeros([max_nprobe, n_rows])
    memory = np.zeros([n_rows])

    for i in range(n_rows):
        # training set without the query row; the query itself as a (1, dim) array
        db_subset = np.delete(database, i, 0)
        query = np.array([database[i]])
        k_major = len(major[true_major[i]])  # size of true cluster (how many neighbors to return)
        k_minor = len(minor[true_minor[i]])  # size of true cluster (how many neighbors to return)

        # create and train index
        quantizer = faiss.IndexFlatL2(dim)
        index = faiss.IndexIVFFlat(quantizer, dim, nlist)
        index.train(db_subset)
        # NOTE(review): the index is trained without row i but the FULL database
        # (including row i) is added. This keeps the returned ids equal to row
        # positions in `database`, which the recall helpers rely on, but it also
        # means the query vector itself is searchable — confirm this is intended.
        index.add(database)

        # how much memory is used?
        memory[i] = get_memory(index)

        # query index using nprobe values 1..max_nprobe
        # NOTE(review): when max_nprobe exceeds nlist, the larger nprobe values
        # presumably cannot probe more lists than exist — verify against FAISS docs.
        for row, nprobe in enumerate(range(1, max_nprobe + 1)):
            index.nprobe = nprobe

            # major cluster query (`timings` rather than `time`, to avoid
            # shadowing the imported `time` module)
            I, timings = query_rep(index, query, k_major)
            recall_maj[row, i] = get_recall_maj(i, I[0])
            speed_maj[row, i] = timings[0]

            # minor cluster query
            I, timings = query_rep(index, query, k_minor)
            recall_min[row, i] = get_recall_min(i, I[0])
            speed_min[row, i] = timings[0]

    # return all results
    return recall_min, recall_maj, speed_min, speed_maj, memory
    

In [9]:
### EXPERIMENT A: NLIST=17
# Create the results directory BEFORE the expensive sweep, so the run cannot
# finish and then fail with FileNotFoundError when the results are written.
os.makedirs('IVFFLAT', exist_ok=True)

recall_min_A, recall_maj_A, speed_min_A, speed_maj_A, memory_A = IVF_flat_experiment(17)

# The five arrays are written one after another into a single file; reading
# them back takes five consecutive np.load calls in the same order.
with open('IVFFLAT/ExperimentA.npy', 'wb') as f:
    np.save(f, recall_min_A)
    np.save(f, recall_maj_A)
    np.save(f, speed_min_A)
    np.save(f, speed_maj_A)
    np.save(f, memory_A)

# To reuse previously saved results instead of recomputing, run this instead:
# with open('IVFFLAT/ExperimentA.npy', 'rb') as f:
#     recall_min_A = np.load(f)
#     recall_maj_A = np.load(f)
#     speed_min_A = np.load(f)
#     speed_maj_A = np.load(f)
#     memory_A = np.load(f)

In [10]:
### EXPERIMENT B: NLIST=47
# Create the results directory BEFORE the expensive sweep, so the run cannot
# finish and then fail with FileNotFoundError when the results are written.
os.makedirs('IVFFLAT', exist_ok=True)

recall_min_B, recall_maj_B, speed_min_B, speed_maj_B, memory_B = IVF_flat_experiment(47)

# The five arrays are written one after another into a single file; reading
# them back takes five consecutive np.load calls in the same order.
with open('IVFFLAT/ExperimentB.npy', 'wb') as f:
    np.save(f, recall_min_B)
    np.save(f, recall_maj_B)
    np.save(f, speed_min_B)
    np.save(f, speed_maj_B)
    np.save(f, memory_B)

# To reuse previously saved results instead of recomputing, run this instead:
# with open('IVFFLAT/ExperimentB.npy', 'rb') as f:
#     recall_min_B = np.load(f)
#     recall_maj_B = np.load(f)
#     speed_min_B = np.load(f)
#     speed_maj_B = np.load(f)
#     memory_B = np.load(f)

In [11]:
### EXPERIMENT C: NLIST=77
# Create the results directory BEFORE the expensive sweep, so the run cannot
# finish and then fail with FileNotFoundError when the results are written.
os.makedirs('IVFFLAT', exist_ok=True)

recall_min_C, recall_maj_C, speed_min_C, speed_maj_C, memory_C = IVF_flat_experiment(77)

# The five arrays are written one after another into a single file; reading
# them back takes five consecutive np.load calls in the same order.
with open('IVFFLAT/ExperimentC.npy', 'wb') as f:
    np.save(f, recall_min_C)
    np.save(f, recall_maj_C)
    np.save(f, speed_min_C)
    np.save(f, speed_maj_C)
    np.save(f, memory_C)

# To reuse previously saved results instead of recomputing, run this instead:
# with open('IVFFLAT/ExperimentC.npy', 'rb') as f:
#     recall_min_C = np.load(f)
#     recall_maj_C = np.load(f)
#     speed_min_C = np.load(f)
#     speed_maj_C = np.load(f)
#     memory_C = np.load(f)

In [13]:
### EXPERIMENT D: NLIST=107
# Create the results directory BEFORE the expensive sweep, so the run cannot
# finish and then fail with FileNotFoundError when the results are written.
os.makedirs('IVFFLAT', exist_ok=True)

recall_min_D, recall_maj_D, speed_min_D, speed_maj_D, memory_D = IVF_flat_experiment(107)

# The five arrays are written one after another into a single file; reading
# them back takes five consecutive np.load calls in the same order.
with open('IVFFLAT/ExperimentD.npy', 'wb') as f:
    np.save(f, recall_min_D)
    np.save(f, recall_maj_D)
    np.save(f, speed_min_D)
    np.save(f, speed_maj_D)
    np.save(f, memory_D)

# To reuse previously saved results instead of recomputing, run this instead:
# with open('IVFFLAT/ExperimentD.npy', 'rb') as f:
#     recall_min_D = np.load(f)
#     recall_maj_D = np.load(f)
#     speed_min_D = np.load(f)
#     speed_maj_D = np.load(f)
#     memory_D = np.load(f)

In [14]:
### EXPERIMENT E: NLIST=137
# Create the results directory BEFORE the expensive sweep, so the run cannot
# finish and then fail with FileNotFoundError when the results are written.
os.makedirs('IVFFLAT', exist_ok=True)

recall_min_E, recall_maj_E, speed_min_E, speed_maj_E, memory_E = IVF_flat_experiment(137)

# The five arrays are written one after another into a single file; reading
# them back takes five consecutive np.load calls in the same order.
with open('IVFFLAT/ExperimentE.npy', 'wb') as f:
    np.save(f, recall_min_E)
    np.save(f, recall_maj_E)
    np.save(f, speed_min_E)
    np.save(f, speed_maj_E)
    np.save(f, memory_E)

# To reuse previously saved results instead of recomputing, run this instead:
# with open('IVFFLAT/ExperimentE.npy', 'rb') as f:
#     recall_min_E = np.load(f)
#     recall_maj_E = np.load(f)
#     speed_min_E = np.load(f)
#     speed_maj_E = np.load(f)
#     memory_E = np.load(f)