In [6]:
import pandas as pd
import numpy as np

In [22]:
# Load the raw benchmark results from the CSV that sits next to the notebook.
# `data` holds the frame as returned by read_csv; `df` is the frame the rest of
# this cell filters and reshapes.
data = pd.read_csv(filepath_or_buffer='scaling-results.csv')
df = pd.DataFrame(data=data)

def cakes_alg(row):
    """Tell whether a results row belongs to a CAKES algorithm.

    Args:
        row: Any mapping-like object indexable by 'algorithm' (e.g. a
            DataFrame row passed by ``df.apply(..., axis=1)``).

    Returns:
        True if ``row['algorithm']`` is one of the CAKES variant names listed
        below, False otherwise (i.e. the row is for a rival algorithm).
    """
    cakes_variants = ('GreedySieve', 'Sieve', 'SieveSepCenter', 'RepeatedRnn', 'Linear')
    return row['algorithm'] in cakes_variants


def alg_order(col):
    """Return the rank of an algorithm name in the table's column order.

    Args:
        col: The value to rank; compared with ``==`` against each known name.

    Returns:
        0 for 'faiss-ivf-flat', 1 for 'faiss-flat', 2 for 'annoy', 3 for
        'CAKES', and None for anything else.
    """
    ranked_names = ('faiss-ivf-flat', 'faiss-flat', 'annoy', 'CAKES')
    for rank, name in enumerate(ranked_names):
        if col == name:
            return rank
    return None


# Build one LaTeX table per dataset: for every scale, the throughput (QPS) and
# recall (R) of each rival algorithm plus the fastest CAKES variant.

# Temporary field so rows can be grouped into CAKES vs. non-CAKES algorithms
df['cakes_alg'] = df.apply(cakes_alg, axis=1)

# We only want k == 10 here
df = df[df.k == 10]

# Get rid of unnecessary columns. Done without inplace=True because `df` is a
# filtered slice at this point and in-place edits on a slice can trigger
# SettingWithCopyWarning.
df = df.drop(columns=['metric', 'cardinality', 'dimensionality', 'tuning_time', 'index_build_time', 'k'])

# Sometimes the 1 multiplier rows got screwed up
# NOTE(review): this fills NaN in *every* remaining column (including
# throughput and recall), not just `scale` -- confirm only `scale` is ever missing.
df = df.fillna(1.0)

# Order in which the algorithms appear in the table, left to right.
algorithms = ['faiss-ivf-flat', 'faiss-flat', 'annoy', 'CAKES']

# This is what we ultimately want as columns in the tables: the scaling factor
# followed by a (QPS, R) pair per algorithm. Ideally each algorithm would get a
# split header; that is condensed by hand later.
columns = ['scaling-factor'] + [
    f'{alg}({measure})' for alg in algorithms for measure in ('QPS', 'R')
]

# Index of each algorithm's QPS column; its recall column is the next one.
qps_column = {alg: 2 * position + 1 for position, alg in enumerate(algorithms)}

# Each dataset gets its own table
for dataset, dataset_df in df.groupby('dataset'):
    print(dataset)

    # One row per scale present for this dataset (2**0 through 2**9 in the
    # full results). Sizing from the data avoids an IndexError with more than
    # ten scales and all-zero padding rows with fewer.
    scale_groups = list(dataset_df.groupby('scale'))
    results_table = np.zeros((len(scale_groups), len(columns)))

    # We separate by scale so that at each scale, we can determine which of the
    # 5 CAKES algorithms was best
    for index, (scale, scale_df) in enumerate(scale_groups):
        results_table[index, 0] = scale

        # We separate between CAKES and non-CAKES algorithms
        for cakes_bool, alg_df in scale_df.groupby('cakes_alg'):
            if cakes_bool:
                # Report the fastest CAKES variant, with *its own* recall, so
                # the QPS/recall pair always describes a single algorithm.
                best = alg_df.loc[alg_df['throughput'].idxmax()]
                results_table[index, qps_column['CAKES']] = best['throughput']
                results_table[index, qps_column['CAKES'] + 1] = best['recall']
            else:
                # Place each rival by name rather than by its position after a
                # sort, so the values always land under the matching header
                # even when a rival is missing at this scale.
                for _, row in alg_df.iterrows():
                    name = row['algorithm']
                    if name not in qps_column:
                        raise ValueError(
                            f'no table column for algorithm {name!r} '
                            f'(dataset {dataset!r}, scale {scale!r})'
                        )
                    results_table[index, qps_column[name]] = row['throughput']
                    results_table[index, qps_column[name] + 1] = row['recall']

    new_df = pd.DataFrame(data=results_table, columns=columns)
    print(new_df.to_latex(header=columns,
                          index=False,
                          float_format="{:.3f}".format))
    


fashion-mnist
           dataset  scale       algorithm   throughput    recall  cakes_alg
316  fashion-mnist    1.0           annoy  2191.849364  0.949727      False
347  fashion-mnist    1.0      faiss-flat   212.602278  1.000000      False
354  fashion-mnist    1.0  faiss-ivf-flat  2009.483440  1.000000      False
           dataset  scale       algorithm   throughput    recall  cakes_alg
302  fashion-mnist    2.0           annoy  2118.513407  0.926636      False
376  fashion-mnist    2.0      faiss-flat   106.372153  1.000000      False
379  fashion-mnist    2.0  faiss-ivf-flat   939.169025  0.999909      False
           dataset  scale       algorithm   throughput    recall  cakes_alg
323  fashion-mnist    4.0           annoy  2043.628270  0.897667      False
352  fashion-mnist    4.0      faiss-flat    53.077121  1.000000      False
344  fashion-mnist    4.0  faiss-ivf-flat   461.015818  0.996833      False
           dataset  scale       algorithm   throughput    recall  cakes_al