In [46]:
import pandas as pd
import numpy as np

Unnamed: 0,dataset,scale,metric,cardinality,dimensionality,index_build_time,tuning_time,k,algorithm,throughput,recall
0,sift,,euclidean,1000000,128,6.813441,0.0,10,GreedySieve,44.571674,1.0
1,sift,,euclidean,1000000,128,6.813441,0.0,10,Sieve,196.2273,1.0
2,sift,,euclidean,1000000,128,6.813441,0.0,10,SieveSepCenter,185.17111,1.0
3,sift,,euclidean,1000000,128,6.813441,0.0,10,RepeatedRnn,54.365353,1.0
4,sift,,euclidean,1000000,128,6.813441,0.0,10,Linear,551.89386,1.0


In [118]:
data = pd.read_csv('scaling-results.csv')
df = pd.DataFrame(data)

# Function to determine if row contains data for a CAKES algorithm or a rival algorithm
def cakes_alg(row): 
    if row['algorithm'] in ['GreedySieve', 'Sieve', 'SieveSepCenter', 'RepeatedRnn', 'Linear']: 
        return True 
    else: 
        return False


# Temporary field to allow sorting by on CAKES vs non-CAKES 
df['cakes_alg'] = df.apply(cakes_alg, axis =1)

# We only want k == 10 here
df = df[df.k == 10]

# Get rid of unnecessary columns 
df.drop(columns = ['metric', 'cardinality', 'dimensionality', 'tuning_time', 'index_build_time', 'k'], inplace = True)

# Sometimes the 1 multiplier rows got screwed up 
df = df.fillna(1.0)

# This is what we ultimately want as columns in the tables 
# Really, we want to have split columns for each algorithm, but my brain is too small to do that here
# so I am just going to condense by hand later
columns = ['faiss-ivf-flat(QPS)', 'faiss-ivf-flat(R)', 'faiss-flat(QPS)', 'faiss-flat(R)', 'annoy(QPS)', 'annoy(R)', 'CAKES(QPS)', 'CAKES(R)']

algorithms = ['faiss-ivf-flat', 'faiss-flat', 'annoy', 'CAKES']

# Each dataset gets its own table 
for (dataset, dataset_df) in df.groupby('dataset'): 

    # For each dataset, we show QPS and recall for each algorithm at each scale from 2**0 through 2**9
    results_table = np.zeros((10, 2*len(algorithms)))

    # We separate by scale so that at each scale, we can determine which of the 5 CAKES algorithms was best
    for index, (scale, scale_df) in enumerate(dataset_df.groupby('scale')): 

        # We separate between CAKES and non-CAKES algorithms 
        for (cakes_bool, alg_df) in scale_df.groupby('cakes_alg'): 
            # 
            if cakes_bool: 
                max_throughput = alg_df['throughput'].max()
                recall = alg_df['recall'].max()
                results_table[index][6] = max_throughput 
                results_table[index][7] = recall 
                
            else: 
                for i, (_, row) in enumerate(alg_df.iterrows()): 
                    results_table[index][2*i] = row.throughput 
                    results_table[index][2*i+1] = row.recall

                
    new_df = pd.DataFrame(data=results_table, columns=columns)
    print(new_df.to_latex(header = ['faiss-ivf-flat(QPS)', 'faiss-ivf-flat(R)', 'faiss-flat(QPS)', 'faiss-flat(R)', 'annoy(QPS)', 'annoy(R)', 'CAKES(QPS)', 'CAKES(R)'], 
            index = False))
    


\begin{tabular}{rrrrrrrr}
\toprule
faiss-ivf-flat(QPS) & faiss-ivf-flat(R) & faiss-flat(QPS) & faiss-flat(R) & annoy(QPS) & annoy(R) & CAKES(QPS) & CAKES(R) \\
\midrule
2191.849364 & 0.949727 & 212.602278 & 1.000000 & 2009.483440 & 1.000000 & 2167.365000 & 1.000000 \\
2118.513407 & 0.926636 & 106.372153 & 1.000000 & 939.169025 & 0.999909 & 1141.224700 & 1.000000 \\
2043.628270 & 0.897667 & 461.015818 & 0.996833 & 53.077121 & 1.000000 & 982.117100 & 1.000000 \\
225.827100 & 0.995333 & 26.577270 & 1.000000 & 1925.592809 & 0.857333 & 1179.813400 & 1.000000 \\
116.960148 & 0.991000 & 13.292249 & 1.000000 & 1841.862831 & 0.861500 & 1201.409400 & 1.000000 \\
6.647092 & 1.000000 & 1850.942695 & 0.775000 & 59.147868 & 0.985000 & 1157.667000 & 1.000000 \\
1787.142346 & 0.677000 & 3.323591 & 1.000000 & 26.068382 & 0.968000 & 1102.849400 & 1.000000 \\
1659.660152 & 0.538000 & 13.265281 & 0.964000 & 1.661720 & 1.000000 & 1039.607000 & 1.000000 \\
6.650268 & 0.962000 & 1597.263598 & 0.592000 & 0.83