In [42]:
import pandas as pd
import numpy as np
import math

In [52]:
# Load the raw benchmark results; `df` is the working frame used by the rest of the cell.
data = pd.read_csv('scaling-results.csv')
df = pd.DataFrame(data)

# Column groups of the final tables, in display order. The last entry ('CAKES')
# is a single summary column pair for the CAKES family of algorithms.
algorithms = [
    'hnsw',
    'annoy',
    'faiss-flat',
    'faiss-ivf-flat',
    'CAKES',
]

# Helper to determine whether a row contains data for a CAKES algorithm or a rival algorithm

def cakes_alg(row):
    """Return True when ``row['algorithm']`` names one of the CAKES search algorithms.

    ``row`` is anything indexable by the key ``'algorithm'`` (for example a
    DataFrame row passed in by ``df.apply(..., axis=1)``). Any other algorithm
    name yields False.
    """
    cakes_names = ('GreedySieve', 'Sieve', 'SieveSepCenter', 'RepeatedRnn', 'Linear')
    return row['algorithm'] in cakes_names


# Temporary boolean field that allows partitioning rows into CAKES vs. non-CAKES
df['cakes_alg'] = df.apply(cakes_alg, axis=1)

# Only the k == 10 runs are reported here
df = df[df['k'] == 10]

# Discard the columns that the tables never use
df = df.drop(columns=['metric', 'cardinality', 'dimensionality', 'index_build_time', 'k'])

# Rows for the 1x multiplier sometimes come through with missing values; treat them as 1.0.
# NOTE(review): fillna is applied to every column, not only 'scale' -- confirm that no
# other column can legitimately contain NaN.
df = df.fillna(1.0)

# Column labels of the output tables: the scale, followed by a (QPS, recall) pair for
# each entry of `algorithms`. Ideally each algorithm would be one header spanning two
# sub-columns; that merging is done by hand on the generated LaTeX afterwards.
columns = ['scale']
for x in algorithms:
    columns.extend([x + '(QPS)', x + '(R)'])
    


# Each dataset gets its own LaTeX table: one row per scale, and for every entry of
# `algorithms` a throughput (QPS) column followed by a recall column.

# Every entry except the trailing 'CAKES' summary is looked up by name in the data.
rival_algorithms = algorithms[:-1]
# The CAKES summary occupies the last two columns of each row.
cakes_qps_col = 2 * len(algorithms) - 1
cakes_recall_col = 2 * len(algorithms)

for (dataset, dataset_df) in df.groupby('dataset'):

    print(dataset)

    # One table row per scale that actually occurs for this dataset. Sizing the table
    # from the groups themselves (rather than from log2 of the global maximum scale)
    # keeps row count and row index in agreement even if a dataset lacks some scales
    # or the scales are not exact powers of two.
    scale_groups = list(dataset_df.groupby('scale'))

    # Start from NaN so that anything that was never measured is rendered as "--"
    # by to_latex below, instead of looking like a real measurement.
    results_table = np.full((len(scale_groups), 2 * len(algorithms) + 1), np.nan)

    for index, (scale, scale_df) in enumerate(scale_groups):
        results_table[index, 0] = scale

        # We separate between CAKES and non-CAKES algorithms
        for (cakes_bool, alg_df) in scale_df.groupby('cakes_alg'):
            if cakes_bool:
                # Report the fastest CAKES algorithm at this scale, together with the
                # recall of that same algorithm, so both numbers describe one run.
                best = alg_df.loc[alg_df['throughput'].idxmax()]
                results_table[index, cakes_qps_col] = best['throughput']
                results_table[index, cakes_recall_col] = best['recall']
            else:
                for i, name in enumerate(rival_algorithms):
                    matches = alg_df[alg_df['algorithm'] == name]
                    # No row: the algorithm was not run at this scale.
                    # Several rows: ambiguous which one to report.
                    # Either way the cells stay NaN and show up as "--".
                    if len(matches) != 1:
                        continue
                    results_table[index, 2 * i + 1] = matches['throughput'].iloc[0]
                    results_table[index, 2 * i + 2] = matches['recall'].iloc[0]

    new_df = pd.DataFrame(data=results_table, columns=columns)
    print(new_df.to_latex(header=columns,
                          index=False,
                          float_format="{:.3f}".format,
                          na_rep="--"))
    


fashion-mnist
\begin{tabular}{rrrrrrrrrrr}
\toprule
scale & hnsw(QPS) & hnsw(R) & annoy(QPS) & annoy(R) & faiss-flat(QPS) & faiss-flat(R) & faiss-ivf-flat(QPS) & faiss-ivf-flat(R) & CAKES(QPS) & CAKES(R) \\
\midrule
1.000 & 13330.275 & 0.954 & 2191.849 & 0.950 & 212.602 & 1.000 & 2009.483 & 1.000 & 2167.365 & 1.000 \\
2.000 & 13822.528 & 0.803 & 2118.513 & 0.927 & 106.372 & 1.000 & 939.169 & 1.000 & 1141.225 & 1.000 \\
4.000 & 16648.524 & 0.681 & 2043.628 & 0.898 & 53.077 & 1.000 & 461.016 & 0.997 & 982.117 & 1.000 \\
8.000 & 16785.981 & 0.525 & 1925.593 & 0.857 & 26.577 & 1.000 & 225.827 & 0.995 & 1179.813 & 1.000 \\
16.000 & 18717.175 & 0.494 & 1841.863 & 0.862 & 13.292 & 1.000 & 116.960 & 0.991 & 1201.409 & 1.000 \\
32.000 & 15597.778 & 0.542 & 1850.943 & 0.775 & 6.647 & 1.000 & 59.148 & 0.985 & 1157.667 & 1.000 \\
64.000 & 14997.277 & 0.378 & 1787.142 & 0.677 & 3.324 & 1.000 & 26.068 & 0.968 & 1102.849 & 1.000 \\
128.000 & 14866.019 & 0.357 & 1659.660 & 0.538 & 1.662 & 1.000 & 13.2